Committed by ggerganov
Commit 919a447 (unverified) · 1 parent: 5df6c6c

sync : ggml (VMM, sync-ggml-am, dotprod ARM fixes, CUDA fixes) (#1691)

* scripts : add sync-ggml-am.sh

* sync : ggml (VMM, ARM dot prod fix, etc.)

* build : fix CUDA build

* ggml : fix some mul mat cases + add tests for src1 F16

https://github.com/ggerganov/ggml/commit/dbd02958fa4f46898f68ca29c27ddcdc58a06f98

Files changed (9)
  1. CMakeLists.txt +7 -1
  2. Makefile +1 -1
  3. extra/sync-ggml-am.sh +138 -0
  4. extra/sync-ggml.last +1 -0
  5. ggml-backend.c +12 -12
  6. ggml-cuda.cu +533 -404
  7. ggml-quants.c +23 -342
  8. ggml.c +4 -7
  9. ggml.h +2 -0
CMakeLists.txt CHANGED
@@ -218,11 +218,17 @@ if (WHISPER_CUBLAS)
     add_compile_definitions(GGML_USE_CUBLAS)
 
     if (WHISPER_STATIC)
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        if (WIN32)
+            # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+        else ()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        endif()
     else()
         set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
+    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
 else()
     message(FATAL_ERROR "cuBLAS not found")
 endif()
Makefile CHANGED
@@ -206,7 +206,7 @@ ifdef WHISPER_CUBLAS
 
     CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
     CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-    LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+    LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
     WHISPER_OBJ += ggml-cuda.o
     NVCC        = nvcc
     NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
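Both build changes above pull in the CUDA driver library (the `CUDA::cuda_driver` target in CMake, `-lcuda` in the Makefile), which exposes the low-level `cu*` driver API on top of the runtime/cuBLAS libraries already linked — presumably needed by the VMM work mentioned in the commit title. A minimal standalone sanity check that the driver API actually links (an illustrative sketch, not part of the commit):

// check_driver.cpp — illustrative only; build e.g. with: nvcc check_driver.cpp -lcuda
#include <cuda.h>
#include <cstdio>

int main() {
    int version = 0;
    if (cuDriverGetVersion(&version) == CUDA_SUCCESS) {  // symbol provided by libcuda
        std::printf("CUDA driver API version: %d\n", version);
        return 0;
    }
    return 1;
}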
extra/sync-ggml-am.sh ADDED
@@ -0,0 +1,138 @@
+#!/bin/bash
+#
+# Synchronize ggml changes to whisper.cpp
+#
+# Usage:
+#
+#   $ cd /path/to/whisper.cpp
+#   $ ./extra/sync-ggml-am.sh
+#
+
+set -e
+
+sd=$(dirname $0)
+cd $sd/../
+
+SRC_WHISPER=$(pwd)
+SRC_GGML=$(cd ../ggml; pwd)
+
+if [ ! -d $SRC_GGML ]; then
+    echo "ggml not found at $SRC_GGML"
+    exit 1
+fi
+
+lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
+echo "Syncing ggml changes since commit $lc"
+
+cd $SRC_GGML
+
+git log --oneline $lc..HEAD
+
+git format-patch $lc --stdout -- \
+    include/ggml/ggml*.h \
+    src/ggml*.h \
+    src/ggml*.c \
+    src/ggml*.cpp \
+    src/ggml*.m \
+    src/ggml*.metal \
+    src/ggml*.cu \
+    tests/test-opt.cpp \
+    tests/test-grad0.cpp \
+    tests/test-quantize-fns.cpp \
+    tests/test-quantize-perf.cpp \
+    tests/test-backend-ops.cpp \
+    > $SRC_WHISPER/ggml-src.patch
+
+# delete files if empty
+if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+cd $SRC_WHISPER
+
+if [ -f $SRC_WHISPER/ggml-src.patch ]; then
+    # replace PR numbers
+    #
+    # Subject: some text (#1234)
+    # Subject: some text (ggml/1234)
+    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    # replace filenames:
+    #
+    # src/ggml.c          -> ggml.c
+    # src/ggml-alloc.c    -> ggml-alloc.c
+    # src/ggml-backend-impl.h -> ggml-backend-impl.h
+    # src/ggml-backend.c  -> ggml-backend.c
+    # src/ggml-cuda.cu    -> ggml-cuda.cu
+    # src/ggml-cuda.h     -> ggml-cuda.h
+    # src/ggml-impl.h     -> ggml-impl.h
+    # src/ggml-metal.h    -> ggml-metal.h
+    # src/ggml-metal.m    -> ggml-metal.m
+    # src/ggml-metal.metal -> ggml-metal.metal
+    # src/ggml-mpi.h      -> ggml-mpi.h
+    # src/ggml-mpi.c      -> ggml-mpi.c
+    # src/ggml-opencl.cpp -> ggml-opencl.cpp
+    # src/ggml-opencl.h   -> ggml-opencl.h
+    # src/ggml-quants.c   -> ggml-quants.c
+    # src/ggml-quants.h   -> ggml-quants.h
+    # include/ggml/ggml.h -> ggml.h
+    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
+    # include/ggml/ggml-backend.h -> ggml-backend.h
+    #
+    # examples/common.h        -> examples/common.h
+    # examples/common.cpp      -> examples/common.cpp
+    # examples/common-ggml.h   -> examples/common-ggml.h
+    # examples/common-ggml.cpp -> examples/common-ggml.cpp
+    #
+    # examples/whisper/whisper.h    -> whisper.h
+    # examples/whisper/whisper.cpp  -> whisper.cpp
+    # examples/whisper/main.cpp     -> examples/main/main.cpp
+    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp
+
+    cat ggml-src.patch | sed \
+        -e 's/src\/ggml\.c/ggml.c/g' \
+        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
+        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
+        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
+        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
+        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
+        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
+        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
+        -e 's/src\/ggml-metal\.metal/ggml-metal.metal/g' \
+        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
+        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
+        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
+        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
+        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
+        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
+        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
+        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
+        -e 's/examples\/common\.h/examples\/common.h/g' \
+        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
+        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
+        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
+        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
+        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
+        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
+        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
+        > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    git am ggml-src.patch
+
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+# update last commit
+cd $SRC_GGML
+git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last
+
+echo "Done"
+
+exit 0
extra/sync-ggml.last ADDED
@@ -0,0 +1 @@
+1467a4eb71bdb5ac316d248a7f3f26cdadc56b68
ggml-backend.c CHANGED
@@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
-    int id = ggml_backend_registry_count;
+    size_t id = ggml_backend_registry_count;
 
     ggml_backend_registry[id] = (struct ggml_backend_reg) {
         /* .name = */ {0},
@@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
             return i;
         }
     }
+
+    // not found
     return SIZE_MAX;
 }
 
@@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
     const char * params = strchr(backend_str, ':');
     char backend_name[128];
     if (params == NULL) {
-        strcpy(backend_name, backend_str);
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
         params = "";
     } else {
-        strncpy(backend_name, backend_str, params - backend_str);
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
        params++;
    }

    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
    if (backend_i == SIZE_MAX) {
        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
        return NULL;
@@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
@@ -618,10 +614,14 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
 }
 
 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return true;
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }
 
     GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }
 
 static struct ggml_backend_i cpu_backend_i = {
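With this change the CPU backend only claims support for a MUL_MAT whose src1 is F32 or already in the format that the dot-product kernel for src0's type expects; every other op still returns true. A hedged sketch of how a caller might use the public supports_op query to pick a backend for a whole graph (the choose_backend helper is hypothetical, not part of this commit):

#include <stdbool.h>
#include <stdio.h>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: return the first candidate backend that reports support
// for every node in the graph. The CPU backend, for example, now declines a
// MUL_MAT whose src1 type does not match what its quantized kernels expect.
static ggml_backend_t choose_backend(ggml_backend_t * candidates, int n_candidates, struct ggml_cgraph * graph) {
    for (int i = 0; i < n_candidates; ++i) {
        bool ok = true;
        for (int j = 0; j < graph->n_nodes; ++j) {
            if (!ggml_backend_supports_op(candidates[i], graph->nodes[j])) {
                ok = false;
                break;
            }
        }
        if (ok) {
            return candidates[i];
        }
    }
    fprintf(stderr, "no single backend supports the whole graph\n");
    return NULL;
}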
ggml-cuda.cu CHANGED
@@ -68,8 +68,9 @@
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
 #endif
 #define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
@@ -86,17 +87,29 @@
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap abort
 #else
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
-// CUDA 10.2 does not have these macro definitions.
-#ifndef CUBLAS_TF32_TENSOR_OP_MATH
 #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
 #define CUBLAS_COMPUTE_16F CUDA_R_16F
 #define CUBLAS_COMPUTE_32F CUDA_R_32F
 #define cublasComputeType_t cudaDataType_t
-#endif
 #endif // defined(GGML_USE_HIPBLAS)
 
 #include "ggml-cuda.h"
@@ -151,7 +164,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
     const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
 #if __has_builtin(__builtin_elementwise_sub_sat)
     const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int&>(c);
 #else
     int8x4_t c;
     int16_t tmp;
@@ -162,7 +175,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
         if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
         c[i] = tmp;
     }
-    return reinterpret_cast<int&>(c);
 #endif // __has_builtin(__builtin_elementwise_sub_sat)
 }
 
@@ -200,45 +213,59 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
-#define CUDA_CHECK(err) \
-    do { \
-        cudaError_t err_ = (err); \
-        if (err_ != cudaSuccess) { \
-            int id; \
-            cudaGetDevice(&id); \
-            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
-                cudaGetErrorString(err_)); \
-            fprintf(stderr, "current device: %d\n", id); \
-            GGML_ASSERT(!"CUDA error"); \
-        } \
     } while (0)
 
 #if CUDART_VERSION >= 12000
-#define CUBLAS_CHECK(err) \
-    do { \
-        cublasStatus_t err_ = (err); \
-        if (err_ != CUBLAS_STATUS_SUCCESS) { \
-            int id; \
-            cudaGetDevice(&id); \
-            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
-                    err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
-            fprintf(stderr, "current device: %d\n", id); \
-            GGML_ASSERT(!"cuBLAS error"); \
-        } \
-    } while (0)
 #else
-#define CUBLAS_CHECK(err) \
-    do { \
-        cublasStatus_t err_ = (err); \
-        if (err_ != CUBLAS_STATUS_SUCCESS) { \
-            int id; \
-            cudaGetDevice(&id); \
-            fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
-            fprintf(stderr, "current device: %d\n", id); \
-            GGML_ASSERT(!"cuBLAS error"); \
-        } \
-    } while (0)
-#endif // CUDART_VERSION >= 11
 
 #if CUDART_VERSION >= 11100
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -294,10 +321,10 @@ typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
 typedef void (*ggml_cuda_op_mul_mat_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream);
 typedef void (*ggml_cuda_op_flatten_t)(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream);
 
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
@@ -503,22 +530,29 @@ struct ggml_tensor_extra_gpu {
 
 // this is faster on Windows
 // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-inline cudaError_t ggml_cuda_set_device(const int device) {
     int current_device;
     CUDA_CHECK(cudaGetDevice(&current_device));
 
     if (device == current_device) {
-        return cudaSuccess;
     }
 
-    return cudaSetDevice(device);
 }
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
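The hunk above drops the `cudaError_t` return path of `ggml_cuda_set_device` (and later hunks remove the `CUDA_CHECK(...)` wrappers around its call sites), so the helper presumably now checks errors internally and returns nothing. The replacement lines are not shown in this hunk; the following is only a hedged sketch of that shape:

// Sketch (assumption): error checking moves inside the helper, so call sites
// become a plain ggml_cuda_set_device(id);
static void ggml_cuda_set_device(const int device) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));

    if (device == current_device) {
        return;
    }

    CUDA_CHECK(cudaSetDevice(device));
}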
@@ -560,6 +594,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
 }
 
 static __device__ __forceinline__ float op_add(const float a, const float b) {
@@ -681,7 +716,7 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] / (1.0f + expf(-x[i]));
 }
 
-static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
     const float GELU_QUICK_COEF = -1.702f;
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
@@ -690,7 +725,7 @@ static __global__ void gelu_quick_f32(const float *x, float *dst, int k) {
     dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ void tanh_f32(const float *x, float *dst, int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
         return;
@@ -707,7 +742,7 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
-static __global__ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
         return;
@@ -760,7 +795,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     }
 }
 
-static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
@@ -785,7 +820,7 @@ static __global__ void concat_f32(const float *x,const float *y, float *dst, const int ne0, const int ne02) {
     }
 }
 
-static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
     int ne0 = ne00 * scale_factor;
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
@@ -805,7 +840,7 @@ static __global__ void upscale_f32(const float *x, float *dst, const int ne00, const int nb02, const int scale_factor) {
     dst[offset_dst] = x[offset_src];
 }
 
-static __global__ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02) {
     int nidx = threadIdx.x + blockIdx.x * blockDim.x;
     if (nidx >= ne0) {
         return;
@@ -4707,7 +4742,6 @@ static __global__ void mul_mat_p021_f16_f32(
 
     const int row_y = col_x;
 
-
     // y is not transposed but permuted
     const int iy = channel*nrows_y + row_y;
 
@@ -5382,7 +5416,7 @@ struct bin_bcast_cuda {
             cne[3] = 1;
         };
 
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
             cnb[1] *= cne[1];
             cnb[2] *= cne[2];
             cnb[3] *= cne[3];
@@ -5875,7 +5909,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -5920,7 +5954,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -5965,7 +5999,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6010,7 +6044,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6055,7 +6089,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6100,7 +6134,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6147,7 +6181,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6193,7 +6227,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6238,7 +6272,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6283,7 +6317,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6543,30 +6577,30 @@ struct scoped_spin_lock {
     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };
 
-struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
-static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
-    size_t max_size = 0, tot_size = 0;
 #endif
     size_t best_diff = 1ull << 36;
     int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
         if (b.ptr != nullptr) {
 #ifdef DEBUG_CUDA_MALLOC
             ++nnz;
-            tot_size += b.size;
             if (b.size > max_size) max_size = b.size;
 #endif
             if (b.size >= size) {
@@ -6586,32 +6620,32 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         }
     }
     if (ibest >= 0) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
         void * ptr = b.ptr;
         *actual_size = b.size;
         b.ptr = nullptr;
         b.size = 0;
         return ptr;
     }
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
     CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
     *actual_size = look_ahead_size;
     return ptr;
 }
 
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(cudaGetDevice(&id));
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
         if (b.ptr == nullptr) {
             b.ptr = ptr;
             b.size = size;
@@ -6619,9 +6653,149 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
         }
     }
     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
     CUDA_CHECK(cudaFree(ptr));
 }
 
 static bool g_cublas_loaded = false;
 
 bool ggml_cublas_loaded(void) {
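The commit title mentions VMM, the fixed-size `cuda_buffer` pool above is being reworked, and the last hunk header advertises roughly 140 new lines that are not reproduced in this excerpt; together with the new `CUDA::cuda_driver` / `-lcuda` link dependency this points at a pool built on the CUDA driver's virtual memory management API. The following is a hedged sketch of that general technique only (illustrative names, not the code this commit adds); it assumes a CUDA context is already current, e.g. via the runtime API:

#include <cuda.h>
#include <cstdlib>

#define DRV_CHECK(x) do { CUresult err_ = (x); if (err_ != CUDA_SUCCESS) abort(); } while (0)

struct vmm_pool {
    CUdeviceptr base = 0;  // start of a large reserved virtual address range
    size_t      cap  = 0;  // bytes of the range currently backed by physical memory
    size_t      used = 0;  // bump-allocator offset
};

// Reserve the virtual range once; physical memory is attached only as needed.
static void vmm_pool_init(vmm_pool & pool, size_t max_size) {
    DRV_CHECK(cuMemAddressReserve(&pool.base, max_size, 0, 0, 0));
}

static void * vmm_pool_alloc(vmm_pool & pool, int device, size_t size) {
    if (pool.used + size > pool.cap) {
        // grow: create a physical allocation and map it at the end of the range
        CUmemAllocationProp prop = {};
        prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        prop.location.id   = device;

        size_t granularity = 0;
        DRV_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));

        const size_t grow = ((pool.used + size - pool.cap) + granularity - 1)/granularity*granularity;

        CUmemGenericAllocationHandle handle;
        DRV_CHECK(cuMemCreate(&handle, grow, &prop, 0));
        DRV_CHECK(cuMemMap(pool.base + pool.cap, grow, 0, handle, 0));

        CUmemAccessDesc access = {};
        access.location = prop.location;
        access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        DRV_CHECK(cuMemSetAccess(pool.base + pool.cap, grow, &access, 1));

        pool.cap += grow;  // a real pool must also refuse to grow past the reserved range
    }
    void * ptr = (void *)(pool.base + pool.used);
    pool.used += size;
    return ptr;  // pointers stay valid as the pool grows, unlike a cudaMalloc-backed pool
}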
@@ -6660,16 +6834,33 @@ void ggml_init_cublas() {
 #endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
-        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
     for (int id = 0; id < g_device_count; ++id) {
@@ -6677,7 +6868,7 @@ void ggml_init_cublas() {
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
 
         // create cuda streams
         for (int is = 0; is < MAX_STREAMS; ++is) {
@@ -6729,8 +6920,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     void * ptr = nullptr;
     cudaError_t err = cudaMallocHost((void **) &ptr, size);
     if (err != cudaSuccess) {
-        // The allocation error can be bypassed. A null ptr will assigned out of this function.
-        // This can fixed the OOM error in WSL.
        cudaGetLastError();
        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
            size/1024.0/1024.0, cudaGetErrorString(err));
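Throughout the file the reads and writes of the global `g_compute_capabilities[id]` array are removed while a `compute_capability` value is still consumed, which suggests per-device properties are being consolidated into a single structure. A hedged sketch of that pattern with assumed names (not necessarily what the commit adds):

// Sketch (assumption): one struct per device instead of parallel global arrays.
struct cuda_device_info {
    int    cc;               // compute capability, e.g. 100*major + 10*minor (+ CC_OFFSET_AMD on HIP)
    bool   vmm;              // whether the driver VMM API is usable on this device
    size_t vmm_granularity;  // minimum granularity for cuMemCreate/cuMemMap
};

static cuda_device_info g_device_info[GGML_CUDA_MAX_DEVICES] = {};

// call sites would then read g_device_info[id].cc where they previously read
// g_compute_capabilities[id]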
@@ -6793,7 +6983,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 
 static void ggml_cuda_op_get_rows(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
 
     GGML_ASSERT(src1->type == GGML_TYPE_I32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
@@ -6835,9 +7025,9 @@ static void ggml_cuda_op_get_rows(
 }
 
 template<class op>
-inline void ggml_cuda_op_bin_bcast(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -6856,7 +7046,7 @@ inline void ggml_cuda_op_bin_bcast(
 
 static void ggml_cuda_op_repeat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & main_stream) {
 
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
@@ -6864,16 +7054,16 @@ static void ggml_cuda_op_repeat(
     (void) src1_d;
 }
 
-inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_acc(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -6890,23 +7080,23 @@ inline void ggml_cuda_op_acc(
     (void) dst;
 }
 
-inline void ggml_cuda_op_mul(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_div(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_gelu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -6918,9 +7108,9 @@ inline void ggml_cuda_op_gelu(
-inline void ggml_cuda_op_silu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6932,9 +7122,9 @@ inline void ggml_cuda_op_silu(
-inline void ggml_cuda_op_gelu_quick(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6946,9 +7136,9 @@ inline void ggml_cuda_op_gelu_quick(
-inline void ggml_cuda_op_tanh(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6960,9 +7150,9 @@ inline void ggml_cuda_op_tanh(
-inline void ggml_cuda_op_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6974,9 +7164,9 @@ inline void ggml_cuda_op_relu(
-inline void ggml_cuda_op_leaky_relu(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6991,9 +7181,9 @@ inline void ggml_cuda_op_leaky_relu(
-inline void ggml_cuda_op_sqr(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7005,9 +7195,9 @@ inline void ggml_cuda_op_sqr(
-inline void ggml_cuda_op_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7025,10 +7215,9 @@ inline void ggml_cuda_op_norm(
-
-inline void ggml_cuda_op_group_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7042,9 +7231,9 @@ inline void ggml_cuda_op_group_norm(
-inline void ggml_cuda_op_concat(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7058,9 +7247,9 @@ inline void ggml_cuda_op_concat(
-inline void ggml_cuda_op_upscale(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7075,9 +7264,9 @@ inline void ggml_cuda_op_upscale(
-inline void ggml_cuda_op_pad(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7092,9 +7281,9 @@ inline void ggml_cuda_op_pad(
-inline void ggml_cuda_op_rms_norm(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7112,10 +7301,10 @@ inline void ggml_cuda_op_rms_norm(
-inline void ggml_cuda_op_mul_mat_q(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
@@ -7177,13 +7366,13 @@ inline void ggml_cuda_op_mul_mat_q(
 static int64_t get_row_rounding(ggml_type type) {
     int64_t min_compute_capability = INT_MAX;
     int64_t max_compute_capability = INT_MIN;
-    for (int64_t id = 0; id < g_device_count; ++id) {
         if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_compute_capabilities[id]) {
-                min_compute_capability = g_compute_capabilities[id];
             }
-            if (max_compute_capability < g_compute_capabilities[id]) {
-                max_compute_capability = g_compute_capabilities[id];
             }
         }
     }
@@ -7235,10 +7424,10 @@ static int64_t get_row_rounding(ggml_type type) {
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
-inline void ggml_cuda_op_mul_mat_vec_q(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     GGML_ASSERT(ggml_nrows(src1) == 1);
@@ -7288,18 +7477,20 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     (void) src1_padded_row_size;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
 
     bool src1_convert_f16 =
         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
@@ -7307,7 +7498,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
         ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
             ne00, 1, sizeof(float), 0, 0,
             ne00, 1, sizeof(half), 0, 0, stream);
@@ -7355,12 +7546,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
             break;
     }
 
-#ifdef GGML_CUDA_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
-#endif // GGML_CUDA_F16
-
     (void) src1;
     (void) dst;
     (void) src1_ddq_i;
@@ -7368,10 +7553,10 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     (void) src1_padded_row_size;
 }
 
-inline void ggml_cuda_op_mul_mat_cublas(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
-    const int64_t src1_padded_row_size, const cudaStream_t & stream) {
 
     GGML_ASSERT(src0_dd_i != nullptr);
     GGML_ASSERT(src1_ddf_i != nullptr);
@@ -7391,33 +7576,31 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-    const int compute_capability = g_compute_capabilities[id];
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        half * src0_as_f16 = nullptr;
-        size_t src0_as = 0;
         if (src0->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
-            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
         }
-        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
 
-        half * src1_as_f16 = nullptr;
-        size_t src1_as = 0;
         if (src1->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
             size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
         }
-        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
-        size_t dst_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
 
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
@@ -7426,36 +7609,33 @@ inline void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
-                &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
-                            src1_ptr, CUDA_R_16F, ne10,
-                &beta_f16,  dst_f16,  CUDA_R_16F, ldc,
                 CUBLAS_COMPUTE_16F,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
-
-        if (src1_as != 0) {
-            ggml_cuda_pool_free(src1_as_f16, src1_as);
-        }
-    }
-    else {
-        float * src0_ddq_as_f32 = nullptr;
-        size_t src0_as = 0;
 
         if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
        }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
 
        const float alpha = 1.0f;
        const float beta = 0.0f;
@@ -7464,13 +7644,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(
             cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                 row_diff, src1_ncols, ne10,
-                &alpha, src0_ddf_i, ne00,
-                        src1_ddf_i, ne10,
-                &beta,  dst_dd_i,   ldc));
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
-        }
     }
 
     (void) dst;
@@ -7478,9 +7654,9 @@ inline void ggml_cuda_op_mul_mat_cublas(
     (void) src1_padded_row_size;
 }
 
-inline void ggml_cuda_op_rope(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -7558,9 +7734,9 @@ inline void ggml_cuda_op_rope(
-inline void ggml_cuda_op_alibi(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7589,9 +7765,9 @@ inline void ggml_cuda_op_alibi(
-inline void ggml_cuda_op_im2col(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7624,10 +7800,9 @@ inline void ggml_cuda_op_im2col(
-
-inline void ggml_cuda_op_sum_rows(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7642,9 +7817,9 @@ inline void ggml_cuda_op_sum_rows(
-inline void ggml_cuda_op_argsort(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7661,9 +7836,9 @@ inline void ggml_cuda_op_argsort(
-inline void ggml_cuda_op_diag_mask_inf(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7681,9 +7856,9 @@ inline void ggml_cuda_op_diag_mask_inf(
-inline void ggml_cuda_op_soft_max(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7702,9 +7877,9 @@ inline void ggml_cuda_op_soft_max(
-inline void ggml_cuda_op_scale(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -7720,9 +7895,9 @@ inline void ggml_cuda_op_scale(
-inline void ggml_cuda_op_clamp(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
-    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
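In the dequantize and cuBLAS paths above, the paired `ggml_cuda_pool_malloc` / `ggml_cuda_pool_free` calls around temporary F16/F32 buffers are removed; the replacement is not shown in these hunks. One common way to express such temporaries, given the pool interface visible earlier in this file, is a small RAII guard so the matching free cannot be forgotten (a hedged sketch, not necessarily the commit's approach):

// Sketch (assumption): RAII wrapper over the pool interface shown above.
template <typename T>
struct cuda_pool_buffer {
    T *    ptr         = nullptr;
    size_t actual_size = 0;

    T * alloc(size_t n_elements) {
        ptr = (T *) ggml_cuda_pool_malloc(n_elements*sizeof(T), &actual_size);
        return ptr;
    }

    ~cuda_pool_buffer() {
        if (ptr != nullptr) {
            ggml_cuda_pool_free(ptr, actual_size);
        }
    }
};

// usage sketch: cuda_pool_buffer<half> dst_f16; half * d = dst_f16.alloc(row_diff*src1_ncols);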
@@ -7762,18 +7937,17 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     float * src1_ddf = nullptr;
     float * dst_ddf = nullptr;
 
-    // as = actual size
-    size_t src0_asf = 0;
-    size_t src1_asf = 0;
-    size_t dst_asf = 0;
 
     ggml_cuda_set_device(g_main_device);
-    const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
         CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
@@ -7781,14 +7955,14 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
         if (src1_on_device) {
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
             CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device];
     } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
     }
 
     // do the computation
@@ -7800,16 +7974,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
         CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
     }
 
-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
-    }
-    if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
-    }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
     if (dst->backend == GGML_BACKEND_CPU) {
         CUDA_CHECK(cudaDeviceSynchronize());
     }
@@ -7826,12 +7990,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
         CUDA_CHECK(cudaDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
 
         for (int id_other = 0; id_other < g_device_count; ++id_other) {
             if (id == id_other) {
@@ -7865,7 +8029,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7883,6 +8046,7 @@ static void ggml_cuda_op_mul_mat(
 
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
     GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
 
@@ -7908,27 +8072,29 @@ static void ggml_cuda_op_mul_mat(
     GGML_ASSERT(!(split && ne03 > 1));
     GGML_ASSERT(!(split && ne02 < ne12));
 
-    // dd = data device
-    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
-    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
-    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
-    // as = actual size
-    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
 
-    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
-    int64_t row_high[GGML_CUDA_MAX_DEVICES];
 
     int used_devices = 0;
 
-    for (int64_t id = 0; id < g_device_count; ++id) {
         // by default, use all rows
-        row_low[id]  = 0;
-        row_high[id] = ne01;
 
         // for multi GPU, get the row boundaries from tensor split
         // and round to mul_mat_q tile sizes
@@ -7936,19 +8102,23 @@ static void ggml_cuda_op_mul_mat(
         const int64_t rounding = get_row_rounding(src0->type);
 
         if (id != 0) {
-            row_low[id]  = ne01*g_tensor_split[id];
-            row_low[id] -= row_low[id] % rounding;
         }
 
         if (id != g_device_count - 1) {
-            row_high[id]  = ne01*g_tensor_split[id + 1];
-            row_high[id] -= row_high[id] % rounding;
         }
     }
 }
 
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
            continue;
        }
@@ -7958,42 +8128,41 @@ static void ggml_cuda_op_mul_mat(
         const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
 
         ggml_cuda_set_device(id);
-        const cudaStream_t stream = g_cudaStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
-            src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
         if (src1_on_device && src1_is_contiguous) {
-            src1_ddf[id] = (float *) src1_extra->data_device[id];
         } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
         }
 
         if (convert_src1_to_q8_1) {
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
 
         if (dst_on_device) {
-            dst_dd[id] = (float *) dst_extra->data_device[id];
         } else {
-            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
         }
     }
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && used_devices > 1) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
@@ -8002,17 +8171,17 @@ static void ggml_cuda_op_mul_mat(
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
                 continue;
             }
 
             const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
             const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const int64_t row_diff = row_high[id] - row_low[id];
 
             ggml_cuda_set_device(id);
-            const cudaStream_t stream = g_cudaStreams[id][is];
 
             // wait for main GPU data if necessary
             if (split && (id != g_main_device || is != 0)) {
@@ -8026,34 +8195,34 @@ static void ggml_cuda_op_mul_mat(
             const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
 
             // for split tensors the data begins at i0 == i0_offset_low
-            char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-            float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
-            char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
-            float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
 
             // the main device memory buffer can be on VRAM scratch, with space for all partial results
             // in that case an offset on dst_ddf_i is needed
             if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
-                dst_dd_i += row_low[id]; // offset is 0 if no tensor split
             }
 
             // copy src0, src1 to device if necessary
             if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
                 if (id != g_main_device) {
                     if (convert_src1_to_q8_1) {
-                        char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
-                        CUDA_CHECK(cudaMemcpyAsync(src1_ddq_i, src1_ddq_i_source, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs,
-                                                cudaMemcpyDeviceToDevice, stream));
                     } else {
                         float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                         src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                        CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_ncols*ne10*sizeof(float),
-                                                cudaMemcpyDeviceToDevice, stream));
                     }
                 }
             } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
                 CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
             } else {
                 GGML_ASSERT(false);
             }
@@ -8064,12 +8233,12 @@ static void ggml_cuda_op_mul_mat(
             }
 
             if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
             }
 
             // do the computation
             op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
             CUDA_CHECK(cudaGetLastError());
 
             // copy dst to host or other device if necessary
@@ -8093,9 +8262,25 @@ static void ggml_cuda_op_mul_mat(
                 // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
-                                            row_diff*sizeof(float), src1_ncols, kind, stream));
             } else {
                 float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                 GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -8112,35 +8297,14 @@ static void ggml_cuda_op_mul_mat(
8112
  }
8113
  }
8114
 
8115
- for (int64_t id = 0; id < g_device_count; ++id) {
8116
- if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
8117
- continue;
8118
- }
8119
- CUDA_CHECK(ggml_cuda_set_device(id));
8120
-
8121
- // free buffers again when done
8122
- if (src0_as[id] > 0) {
8123
- ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
8124
- }
8125
- if (src1_asf[id] > 0) {
8126
- ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
8127
- }
8128
- if (src1_asq[id] > 0) {
8129
- ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
8130
- }
8131
- if (dst_as[id] > 0) {
8132
- ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
8133
- }
8134
- }
8135
-
8136
  // main device waits for all other devices to be finished
8137
  if (split && g_device_count > 1) {
8138
  int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
8139
  is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
8140
 
8141
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8142
- for (int64_t id = 0; id < g_device_count; ++id) {
8143
- if (row_low[id] == row_high[id]) {
8144
  continue;
8145
  }
8146
  for (int64_t is = 0; is < is_max; ++is) {
@@ -8150,7 +8314,7 @@ static void ggml_cuda_op_mul_mat(
8150
  }
8151
 
8152
  if (dst->backend == GGML_BACKEND_CPU) {
8153
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8154
  CUDA_CHECK(cudaDeviceSynchronize());
8155
  }
8156
  }
@@ -8260,7 +8424,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
8260
 
8261
  const int64_t ne12 = src1->ne[2];
8262
 
8263
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8264
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8265
 
8266
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8292,7 +8456,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
8292
 
8293
  const int64_t ne12 = src1->ne[2];
8294
 
8295
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8296
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8297
 
8298
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -8329,9 +8493,9 @@ static __global__ void k_compute_batched_ptrs(
8329
  int64_t i03 = i13 / r3;
8330
  int64_t i02 = i12 / r2;
8331
 
8332
- ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
8333
- ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
8334
- ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
8335
  }
8336
 
8337
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8340,37 +8504,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8340
 
8341
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
8342
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
8343
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
8344
-
8345
- const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
8346
- const int64_t ne01 = src0->ne[1];
8347
- const int64_t ne02 = src0->ne[2];
8348
- const int64_t ne03 = src0->ne[3];
8349
-
8350
- const int64_t nb01 = src0->nb[1];
8351
- const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
8352
- const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
8353
-
8354
- const int64_t ne10 = src1->ne[0];
8355
- const int64_t ne11 = src1->ne[1];
8356
- const int64_t ne12 = src1->ne[2];
8357
- const int64_t ne13 = src1->ne[3];
8358
 
8359
- const int64_t nb11 = src1->nb[1];
8360
- const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
8361
- const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
8362
 
8363
- const int64_t ne1 = ggml_nelements(src1);
8364
- const int64_t ne = ggml_nelements(dst);
8365
 
8366
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8367
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8368
 
8369
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8370
 
8371
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8372
  void * src0_ddq = src0_extra->data_device[g_main_device];
8373
- half * src0_as_f16 = (half *) src0_ddq;
8374
 
8375
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8376
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
@@ -8379,17 +8525,18 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8379
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8380
 
8381
  // convert src1 to fp16
8382
- const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8383
- GGML_ASSERT(to_fp16_cuda != nullptr);
8384
-
8385
- size_t src1_as = 0;
8386
- half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
8387
- to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
8388
-
8389
- size_t dst_as = 0;
 
8390
 
8391
- half * dst_f16 = nullptr;
8392
- char * dst_t = nullptr;
8393
 
8394
  cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
8395
  cudaDataType_t cu_data_type = CUDA_R_16F;
@@ -8408,8 +8555,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8408
  const void * beta = &beta_f16;
8409
 
8410
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8411
- dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
8412
- dst_t = (char *) dst_f16;
8413
 
8414
  nbd2 /= sizeof(float) / sizeof(half);
8415
  nbd3 /= sizeof(float) / sizeof(half);
@@ -8456,9 +8602,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8456
  CUBLAS_CHECK(
8457
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8458
  ne01, ne11, ne10,
8459
- alpha, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
8460
- (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
8461
- beta, ( char *) dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), // strideC
8462
  ne12*ne13,
8463
  cu_compute_type,
8464
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -8466,23 +8612,18 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8466
  // use cublasGemmBatchedEx
8467
  const int ne23 = ne12*ne13;
8468
 
8469
- const void ** ptrs_src = nullptr;
8470
- void ** ptrs_dst = nullptr;
8471
-
8472
- size_t ptrs_src_s = 0;
8473
- size_t ptrs_dst_s = 0;
8474
-
8475
- ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
8476
- ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
8477
 
8478
  dim3 block_dims(ne13, ne12);
8479
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
8480
- src0_as_f16, src1_as_f16, dst_t,
8481
- ptrs_src, ptrs_dst,
8482
  ne12, ne13,
8483
  ne23,
8484
  nb02, nb03,
8485
- nb12, nb13,
 
8486
  nbd2, nbd3,
8487
  r2, r3);
8488
  CUDA_CHECK(cudaGetLastError());
@@ -8490,30 +8631,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
8490
  CUBLAS_CHECK(
8491
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8492
  ne01, ne11, ne10,
8493
- alpha, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
8494
- (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
8495
- beta, ( void **) (ptrs_dst + 0*ne23), cu_data_type, ne01,
8496
  ne23,
8497
  cu_compute_type,
8498
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
8499
-
8500
- if (ptrs_src_s != 0) {
8501
- ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
8502
- }
8503
- if (ptrs_dst_s != 0) {
8504
- ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
8505
- }
8506
  }
8507
  #endif
8508
 
8509
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8510
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8511
- to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
8512
-
8513
- ggml_cuda_pool_free(dst_f16, dst_as);
8514
  }
8515
-
8516
- ggml_cuda_pool_free(src1_as_f16, src1_as);
8517
  }
8518
 
8519
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -8525,9 +8655,9 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8525
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
8526
 
8527
  int64_t min_compute_capability = INT_MAX;
8528
- for (int64_t id = 0; id < g_device_count; ++id) {
8529
- if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8530
- min_compute_capability = g_compute_capabilities[id];
8531
  }
8532
  }
8533
 
@@ -8551,13 +8681,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
8551
  } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
8552
  // KQV single-batch
8553
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
8554
- } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
8555
  // KQ + KQV multi-batch
8556
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
8557
  } else if (src0->type == GGML_TYPE_F32) {
8558
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
8559
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
8560
- if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
8561
  #ifdef GGML_CUDA_FORCE_DMMV
8562
  const bool use_mul_mat_vec_q = false;
8563
  #else
@@ -8668,7 +8798,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
8668
  const int64_t ne1 = ggml_nelements(src1);
8669
  const int64_t ne = ggml_nelements(dst);
8670
 
8671
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8672
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8673
 
8674
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
@@ -8786,7 +8916,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8786
 
8787
  std::vector<char> ids_host(ggml_nbytes(ids));
8788
 
8789
- const cudaStream_t stream = g_cudaStreams[g_main_device][0];
8790
 
8791
  if (ids->backend == GGML_BACKEND_GPU) {
8792
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
@@ -8840,12 +8970,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8840
  ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8841
  }
8842
  } else {
8843
- size_t as_src1, as_dst;
8844
- char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
8845
- char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
8846
 
8847
- src1_row_extra.data_device[g_main_device] = src1_contiguous;
8848
- dst_row_extra.data_device[g_main_device] = dst_contiguous;
8849
 
8850
  const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
8851
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
@@ -8865,7 +8994,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8865
 
8866
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
8867
 
8868
- CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
8869
  nb11, src1_kind, stream));
8870
  num_src1_rows++;
8871
  }
@@ -8897,14 +9026,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
8897
 
8898
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
8899
 
8900
- CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
8901
  nb1, dst_kind, stream));
8902
  num_src1_rows++;
8903
  }
8904
  }
8905
-
8906
- ggml_cuda_pool_free(src1_contiguous, as_src1);
8907
- ggml_cuda_pool_free(dst_contiguous, as_dst);
8908
  }
8909
 
8910
  if (dst->backend == GGML_BACKEND_CPU) {
@@ -8946,7 +9072,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
8946
  const int64_t nb11 = src1->nb[1];
8947
  const int64_t nb12 = src1->nb[2];
8948
 
8949
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
8950
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8951
 
8952
  const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -9036,7 +9162,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
9036
  ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9037
  memset(extra, 0, sizeof(*extra));
9038
 
9039
- for (int64_t id = 0; id < g_device_count; ++id) {
9040
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9041
  continue;
9042
  }
@@ -9107,15 +9233,14 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
9107
 
9108
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9109
 
9110
- for (int64_t id = 0; id < g_device_count; ++id) {
 
9111
  if (extra->data_device[id] != nullptr) {
9112
- CUDA_CHECK(ggml_cuda_set_device(id));
9113
  CUDA_CHECK(cudaFree(extra->data_device[id]));
9114
  }
9115
 
9116
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9117
  if (extra->events[id][is] != nullptr) {
9118
- CUDA_CHECK(ggml_cuda_set_device(id));
9119
  CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9120
  }
9121
  }
@@ -9169,7 +9294,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
9169
  force_inplace;
9170
  const size_t size = ggml_nbytes(tensor);
9171
 
9172
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
9173
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
9174
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9175
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -9246,7 +9371,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
9246
  GGML_ASSERT(ggml_is_contiguous(tensor));
9247
 
9248
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9249
- CUDA_CHECK(ggml_cuda_set_device(g_main_device));
9250
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
9251
  }
9252
 
@@ -9670,12 +9795,16 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
9670
  // host buffer type
9671
 
9672
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9673
- CUDA_CHECK(cudaFreeHost(buffer->context));
9674
  }
9675
 
9676
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
9677
- void * ptr;
9678
- CUDA_CHECK(cudaMallocHost(&ptr, size));
 
 
 
 
9679
 
9680
  // FIXME: this is a hack to avoid having to implement a new buffer type
9681
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
 
68
  #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
69
  #endif
70
  #define cudaMemcpy hipMemcpy
 
71
  #define cudaMemcpyAsync hipMemcpyAsync
72
+ #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
73
+ #define cudaMemcpy2DAsync hipMemcpy2DAsync
74
  #define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
75
  #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
76
  #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 
87
  #define cudaStream_t hipStream_t
88
  #define cudaSuccess hipSuccess
89
  #define __trap abort
90
+ #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
91
+ #define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
92
+ #define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
93
+ #define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
94
+ #define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
95
+ #define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
96
+ #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
97
+ #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
98
+ #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
99
  #else
100
  #include <cuda_runtime.h>
101
+ #include <cuda.h>
102
  #include <cublas_v2.h>
103
  #include <cuda_fp16.h>
104
+
105
+ #if CUDART_VERSION < 11020
106
+ #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
107
  #define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
108
  #define CUBLAS_COMPUTE_16F CUDA_R_16F
109
  #define CUBLAS_COMPUTE_32F CUDA_R_32F
110
  #define cublasComputeType_t cudaDataType_t
111
+ #endif // CUDART_VERSION < 11020
112
+
113
  #endif // defined(GGML_USE_HIPBLAS)
114
 
115
  #include "ggml-cuda.h"
 
164
  const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
165
  #if __has_builtin(__builtin_elementwise_sub_sat)
166
  const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
167
+ return reinterpret_cast<const int &>(c);
168
  #else
169
  int8x4_t c;
170
  int16_t tmp;
 
175
  if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
176
  c[i] = tmp;
177
  }
178
+ return reinterpret_cast<int &>(c);
179
  #endif // __has_builtin(__builtin_elementwise_sub_sat)
180
  }
181
 
 
213
 
214
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
215
 
216
+ [[noreturn]]
217
+ static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
218
+ int id = -1; // in case cudaGetDevice fails
219
+ cudaGetDevice(&id);
220
+
221
+ fprintf(stderr, "CUDA error: %s\n", msg);
222
+ fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line);
223
+ fprintf(stderr, " %s\n", stmt);
224
+ // abort with GGML_ASSERT to get a stack trace
225
+ GGML_ASSERT(!"CUDA error");
226
+ }
227
+
228
+ #define CUDA_CHECK_GEN(err, success, error_fn) \
229
+ do { \
230
+ auto err_ = (err); \
231
+ if (err_ != (success)) { \
232
+ ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
233
+ } \
234
  } while (0)
235
 
236
+ #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
237
+
238
  #if CUDART_VERSION >= 12000
239
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
240
+ return cublasGetStatusString(err);
241
+ }
 
 
 
 
 
 
 
 
 
242
  #else
243
+ static const char * cublas_get_error_str(const cublasStatus_t err) {
244
+ switch (err) {
245
+ case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
246
+ case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
247
+ case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
248
+ case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
249
+ case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
250
+ case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
251
+ case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
252
+ case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
253
+ case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
254
+ default: return "unknown error";
255
+ }
256
+ }
257
+ #endif // CUDART_VERSION >= 12000
258
+
259
+ #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
260
+
261
+ #if !defined(GGML_USE_HIPBLAS)
262
+ static const char * cu_get_error_str(CUresult err) {
263
+ const char * err_str;
264
+ cuGetErrorString(err, &err_str);
265
+ return err_str;
266
+ }
267
+ #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
268
+ #endif
269
 
270
  #if CUDART_VERSION >= 11100
271
  #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
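The CUDA_CHECK, CUBLAS_CHECK and CU_CHECK macros above all expand to the same CUDA_CHECK_GEN pattern and differ only in their success constant and error-to-string function. A minimal usage sketch with hypothetical call sites (not part of this diff); note that CU_CHECK is only defined for the non-HIP build:

    static void example_checked_calls(void * dst, const void * src, size_t n, cudaStream_t stream) {
        // runtime API: compared against cudaSuccess, stringified with cudaGetErrorString
        CUDA_CHECK(cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream));
        // driver API: compared against CUDA_SUCCESS, stringified with cuGetErrorString
        CU_CHECK(cuCtxSynchronize());
    }

On failure, ggml_cuda_error() reports the failing statement, the current device and the source location, then aborts through GGML_ASSERT so that a stack trace is produced.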
 
321
  typedef void (*ggml_cuda_op_mul_mat_t)(
322
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
323
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
324
+ const int64_t src1_padded_row_size, cudaStream_t stream);
325
  typedef void (*ggml_cuda_op_flatten_t)(
326
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
327
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
328
 
329
  // QK = number of values after dequantization
330
  // QR = QK / number of values before dequantization
 
530
 
531
  // this is faster on Windows
532
  // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
533
+ static void ggml_cuda_set_device(const int device) {
534
  int current_device;
535
  CUDA_CHECK(cudaGetDevice(&current_device));
536
 
537
  if (device == current_device) {
538
+ return;
539
  }
540
 
541
+ CUDA_CHECK(cudaSetDevice(device));
542
  }
543
 
544
  static int g_device_count = -1;
545
  static int g_main_device = 0;
 
546
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
547
 
548
+ struct cuda_device_capabilities {
549
+ int cc; // compute capability
550
+ bool vmm; // virtual memory support
551
+ size_t vmm_granularity; // granularity of virtual memory
552
+ };
553
+
554
+ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
555
+
556
  static void * g_scratch_buffer = nullptr;
557
  static size_t g_scratch_size = 0; // disabled by default
558
  static size_t g_scratch_offset = 0;
 
594
 
595
  static __device__ __forceinline__ float op_repeat(const float a, const float b) {
596
  return b;
597
+ GGML_UNUSED(a);
598
  }
599
 
600
  static __device__ __forceinline__ float op_add(const float a, const float b) {
 
716
  dst[i] = x[i] / (1.0f + expf(-x[i]));
717
  }
718
 
719
+ static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
720
  const float GELU_QUICK_COEF = -1.702f;
721
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
722
  if (i >= k) {
 
725
  dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
726
  }
727
 
728
+ static __global__ void tanh_f32(const float * x, float * dst, int k) {
729
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
730
  if (i >= k) {
731
  return;
 
742
  dst[i] = fmaxf(x[i], 0);
743
  }
744
 
745
+ static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
746
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
747
  if (i >= k) {
748
  return;
 
795
  }
796
  }
797
 
798
+ static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
799
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
800
  if (nidx >= ne0) {
801
  return;
 
820
  }
821
  }
822
 
823
+ static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int nb02, const int scale_factor) {
824
  int ne0 = ne00 * scale_factor;
825
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
826
  if (nidx >= ne0) {
 
840
  dst[offset_dst] = x[offset_src];
841
  }
842
 
843
+ static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02) {
844
  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
845
  if (nidx >= ne0) {
846
  return;
 
4742
 
4743
  const int row_y = col_x;
4744
 
 
4745
  // y is not transposed but permuted
4746
  const int iy = channel*nrows_y + row_y;
4747
 
 
5416
  cne[3] = 1;
5417
  };
5418
 
5419
+ auto collapse_nb = [](size_t cnb[], const int64_t cne[]) {
5420
  cnb[1] *= cne[1];
5421
  cnb[2] *= cne[2];
5422
  cnb[3] *= cne[3];
 
5909
 
5910
  int id;
5911
  CUDA_CHECK(cudaGetDevice(&id));
5912
+ const int compute_capability = g_device_caps[id].cc;
5913
 
5914
  int mmq_x, mmq_y, nwarps;
5915
  if (compute_capability >= CC_RDNA2) {
 
5954
 
5955
  int id;
5956
  CUDA_CHECK(cudaGetDevice(&id));
5957
+ const int compute_capability = g_device_caps[id].cc;
5958
 
5959
  int mmq_x, mmq_y, nwarps;
5960
  if (compute_capability >= CC_RDNA2) {
 
5999
 
6000
  int id;
6001
  CUDA_CHECK(cudaGetDevice(&id));
6002
+ const int compute_capability = g_device_caps[id].cc;
6003
 
6004
  int mmq_x, mmq_y, nwarps;
6005
  if (compute_capability >= CC_RDNA2) {
 
6044
 
6045
  int id;
6046
  CUDA_CHECK(cudaGetDevice(&id));
6047
+ const int compute_capability = g_device_caps[id].cc;
6048
 
6049
  int mmq_x, mmq_y, nwarps;
6050
  if (compute_capability >= CC_RDNA2) {
 
6089
 
6090
  int id;
6091
  CUDA_CHECK(cudaGetDevice(&id));
6092
+ const int compute_capability = g_device_caps[id].cc;
6093
 
6094
  int mmq_x, mmq_y, nwarps;
6095
  if (compute_capability >= CC_RDNA2) {
 
6134
 
6135
  int id;
6136
  CUDA_CHECK(cudaGetDevice(&id));
6137
+ const int compute_capability = g_device_caps[id].cc;
6138
 
6139
  int mmq_x, mmq_y, nwarps;
6140
  if (compute_capability >= CC_RDNA2) {
 
6181
 
6182
  int id;
6183
  CUDA_CHECK(cudaGetDevice(&id));
6184
+ const int compute_capability = g_device_caps[id].cc;
6185
 
6186
  int mmq_x, mmq_y, nwarps;
6187
  if (compute_capability >= CC_RDNA2) {
 
6227
 
6228
  int id;
6229
  CUDA_CHECK(cudaGetDevice(&id));
6230
+ const int compute_capability = g_device_caps[id].cc;
6231
 
6232
  int mmq_x, mmq_y, nwarps;
6233
  if (compute_capability >= CC_RDNA2) {
 
6272
 
6273
  int id;
6274
  CUDA_CHECK(cudaGetDevice(&id));
6275
+ const int compute_capability = g_device_caps[id].cc;
6276
 
6277
  int mmq_x, mmq_y, nwarps;
6278
  if (compute_capability >= CC_RDNA2) {
 
6317
 
6318
  int id;
6319
  CUDA_CHECK(cudaGetDevice(&id));
6320
+ const int compute_capability = g_device_caps[id].cc;
6321
 
6322
  int mmq_x, mmq_y, nwarps;
6323
  if (compute_capability >= CC_RDNA2) {
 
6577
  scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
6578
  };
6579
 
6580
+ static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
6581
+
6582
+ // #define DEBUG_CUDA_MALLOC
6583
+ struct ggml_cuda_buffer {
6584
  void * ptr = nullptr;
6585
  size_t size = 0;
6586
  };
6587
 
6588
+ static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
6589
+ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
6590
 
6591
+ static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) {
6592
  scoped_spin_lock lock(g_cuda_pool_lock);
 
 
6593
  #ifdef DEBUG_CUDA_MALLOC
6594
  int nnz = 0;
6595
+ size_t max_size = 0;
6596
  #endif
6597
  size_t best_diff = 1ull << 36;
6598
  int ibest = -1;
6599
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
6600
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
6601
  if (b.ptr != nullptr) {
6602
  #ifdef DEBUG_CUDA_MALLOC
6603
  ++nnz;
 
6604
  if (b.size > max_size) max_size = b.size;
6605
  #endif
6606
  if (b.size >= size) {
 
6620
  }
6621
  }
6622
  if (ibest >= 0) {
6623
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest];
6624
  void * ptr = b.ptr;
6625
  *actual_size = b.size;
6626
  b.ptr = nullptr;
6627
  b.size = 0;
6628
  return ptr;
6629
  }
 
 
 
 
6630
  void * ptr;
6631
  size_t look_ahead_size = (size_t) (1.05 * size);
6632
  look_ahead_size = 256 * ((look_ahead_size + 255)/256);
6633
+ ggml_cuda_set_device(device);
6634
  CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
6635
  *actual_size = look_ahead_size;
6636
+ g_cuda_pool_size[device] += look_ahead_size;
6637
+ #ifdef DEBUG_CUDA_MALLOC
6638
+ fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz,
6639
+ (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024));
6640
+ #endif
6641
  return ptr;
6642
  }
6643
 
6644
+ static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) {
6645
  scoped_spin_lock lock(g_cuda_pool_lock);
 
 
6646
 
6647
  for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
6648
+ ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i];
6649
  if (b.ptr == nullptr) {
6650
  b.ptr = ptr;
6651
  b.size = size;
 
6653
  }
6654
  }
6655
  fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
6656
+ ggml_cuda_set_device(device);
6657
  CUDA_CHECK(cudaFree(ptr));
6658
+ g_cuda_pool_size[device] -= size;
6659
  }
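A short usage sketch of the legacy pool pair above; in practice these are only reached through ggml_cuda_pool_malloc / ggml_cuda_pool_free and the cuda_pool_alloc wrapper introduced further down (the helper name here is hypothetical):

    static void example_leg_pool(int device, size_t n) {
        size_t actual_size = 0;
        // returns a buffer of at least n*sizeof(float) bytes; actual_size receives the real size
        float * buf = (float *) ggml_cuda_pool_malloc_leg(device, n*sizeof(float), &actual_size);
        // ... use buf on `device` ...
        ggml_cuda_pool_free_leg(device, buf, actual_size); // hand the buffer back to the pool for reuse
    }

The pool keeps up to MAX_CUDA_BUFFERS previously freed buffers per device and reuses the one with the best size fit; only when nothing fits does it fall back to cudaMalloc with a roughly 5% look-ahead margin.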
6660
 
6661
+ #if !defined(GGML_USE_HIPBLAS)
6662
+ // pool with virtual memory
6663
+ static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
6664
+ static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
6665
+ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
6666
+
6667
+ static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) {
6668
+ scoped_spin_lock lock(g_cuda_pool_lock);
6669
+
6670
+ // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
6671
+ const size_t alignment = 128;
6672
+ size = alignment * ((size + alignment - 1) / alignment);
6673
+
6674
+ size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device];
6675
+
6676
+ if (size > avail) {
6677
+ // round up to the next multiple of the granularity
6678
+ size_t reserve_size = size - avail;
6679
+ const size_t granularity = g_device_caps[device].vmm_granularity;
6680
+ reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
6681
+
6682
+ GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
6683
+
6684
+ // allocate more physical memory
6685
+ CUmemAllocationProp prop = {};
6686
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
6687
+ prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6688
+ prop.location.id = device;
6689
+ CUmemGenericAllocationHandle handle;
6690
+ CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
6691
+
6692
+ // reserve virtual address space (if not already reserved)
6693
+ if (g_cuda_pool_addr[device] == 0) {
6694
+ CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
6695
+ }
6696
+
6697
+ // map at the end of the pool
6698
+ CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0));
6699
+
6700
+ // the memory allocation handle is no longer needed after mapping
6701
+ CU_CHECK(cuMemRelease(handle));
6702
+
6703
+ // set access
6704
+ CUmemAccessDesc access = {};
6705
+ access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6706
+ access.location.id = device;
6707
+ access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
6708
+ CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1));
6709
+
6710
+ // add to the pool
6711
+ g_cuda_pool_size[device] += reserve_size;
6712
+
6713
+ //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
6714
+ // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
6715
+ // (unsigned long long) (reserve_size/1024/1024));
6716
+ }
6717
+
6718
+ GGML_ASSERT(g_cuda_pool_addr[device] != 0);
6719
+
6720
+ void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]);
6721
+ *actual_size = size;
6722
+ g_cuda_pool_used[device] += size;
6723
+
6724
+ #ifdef DEBUG_CUDA_MALLOC
6725
+ printf("cuda pool[%d]: allocated %llu bytes at %p\n", device, (unsigned long long) size, ptr);
6726
+ #endif
6727
+
6728
+ return ptr;
6729
+ }
6730
+
6731
+ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
6732
+ scoped_spin_lock lock(g_cuda_pool_lock);
6733
+
6734
+ #ifdef DEBUG_CUDA_MALLOC
6735
+ printf("cuda pool[%d]: freed %llu bytes at %p\n", device, (unsigned long long) size, ptr);
6736
+ #endif
6737
+
6738
+ g_cuda_pool_used[device] -= size;
6739
+
6740
+ // all deallocations must be in reverse order of the allocations
6741
+ GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]));
6742
+ }
6743
+
6744
+ static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
6745
+ if (g_device_caps[device].vmm) {
6746
+ return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
6747
+ } else {
6748
+ return ggml_cuda_pool_malloc_leg(device, size, actual_size);
6749
+ }
6750
+ }
6751
+
6752
+ static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
6753
+ if (g_device_caps[device].vmm) {
6754
+ ggml_cuda_pool_free_vmm(device, ptr, size);
6755
+ } else {
6756
+ ggml_cuda_pool_free_leg(device, ptr, size);
6757
+ }
6758
+ }
6759
+ #else
6760
+ #define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
6761
+ #define ggml_cuda_pool_free ggml_cuda_pool_free_leg
6762
+ #endif // !defined(GGML_USE_HIPBLAS)
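Both the per-allocation alignment and the VMM reserve size above are rounded up with the same integer idiom; as a standalone sketch (the helper name is hypothetical):

    // round size up to the next multiple of `multiple` (128-byte alignment for allocations,
    // the driver-reported granularity for physical memory reservations)
    static size_t example_round_up(size_t size, size_t multiple) {
        return multiple * ((size + multiple - 1) / multiple);
    }

For the VMM pool the multiple is the per-device granularity obtained from cuMemGetAllocationGranularity during initialization.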
6763
+
6764
+ template<typename T>
6765
+ struct cuda_pool_alloc {
6766
+ int device = -1;
6767
+ T * ptr = nullptr;
6768
+ size_t actual_size = 0;
6769
+
6770
+ // size is in number of elements
6771
+ T * alloc(size_t size) {
6772
+ GGML_ASSERT(ptr == nullptr);
6773
+ CUDA_CHECK(cudaGetDevice(&device));
6774
+ ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size);
6775
+ return ptr;
6776
+ }
6777
+
6778
+ cuda_pool_alloc(size_t size) {
6779
+ alloc(size);
6780
+ }
6781
+
6782
+ ~cuda_pool_alloc() {
6783
+ if (ptr != nullptr) {
6784
+ ggml_cuda_pool_free(device, ptr, actual_size);
6785
+ }
6786
+ }
6787
+
6788
+ T * get() {
6789
+ return ptr;
6790
+ }
6791
+
6792
+ cuda_pool_alloc() = default;
6793
+ cuda_pool_alloc(const cuda_pool_alloc &) = delete;
6794
+ cuda_pool_alloc(cuda_pool_alloc &&) = delete;
6795
+ cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
6796
+ cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
6797
+ };
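A usage sketch of the cuda_pool_alloc wrapper above (the function is hypothetical): buffers are tied to scope and released in reverse order of allocation, which matches the LIFO ordering asserted by ggml_cuda_pool_free_vmm.

    static void example_pool_alloc(size_t n) {
        cuda_pool_alloc<half>  tmp_f16(n); // n half elements, allocated on the current device
        cuda_pool_alloc<float> tmp_f32;    // deferred allocation
        tmp_f32.alloc(n);
        // ... launch kernels using tmp_f16.get() and tmp_f32.get() ...
    } // tmp_f32 is returned to the pool first, then tmp_f16, when the scope ends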
6798
+
6799
  static bool g_cublas_loaded = false;
6800
 
6801
  bool ggml_cublas_loaded(void) {
 
6834
  #endif
6835
  fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
6836
  for (int id = 0; id < g_device_count; ++id) {
6837
+ int device_vmm = 0;
6838
+
6839
+ #if !defined(GGML_USE_HIPBLAS)
6840
+ CUdevice device;
6841
+ CU_CHECK(cuDeviceGet(&device, id));
6842
+ CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
6843
+
6844
+ if (device_vmm) {
6845
+ CUmemAllocationProp alloc_prop = {};
6846
+ alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
6847
+ alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6848
+ alloc_prop.location.id = id;
6849
+ CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
6850
+ }
6851
+ #endif // !defined(GGML_USE_HIPBLAS)
6852
+ g_device_caps[id].vmm = !!device_vmm;
6853
+
6854
  cudaDeviceProp prop;
6855
  CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
6856
+ fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
6857
 
6858
  g_tensor_split[id] = total_vram;
6859
  total_vram += prop.totalGlobalMem;
6860
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6861
+ g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
6862
  #else
6863
+ g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
6864
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6865
  }
6866
  for (int id = 0; id < g_device_count; ++id) {
 
6868
  }
6869
 
6870
  for (int id = 0; id < g_device_count; ++id) {
6871
+ ggml_cuda_set_device(id);
6872
 
6873
  // create cuda streams
6874
  for (int is = 0; is < MAX_STREAMS; ++is) {
 
6920
  void * ptr = nullptr;
6921
  cudaError_t err = cudaMallocHost((void **) &ptr, size);
6922
  if (err != cudaSuccess) {
6923
+ // clear the error
 
6924
  cudaGetLastError();
6925
  fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
6926
  size/1024.0/1024.0, cudaGetErrorString(err));
 
6983
 
6984
  static void ggml_cuda_op_get_rows(
6985
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
6986
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) {
6987
 
6988
  GGML_ASSERT(src1->type == GGML_TYPE_I32);
6989
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
7025
  }
7026
 
7027
  template<class op>
7028
+ static void ggml_cuda_op_bin_bcast(
7029
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7030
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7031
 
7032
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
7033
 
 
7046
 
7047
  static void ggml_cuda_op_repeat(
7048
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7049
+ const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) {
7050
 
7051
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
7052
 
 
7054
  (void) src1_d;
7055
  }
7056
 
7057
+ static void ggml_cuda_op_add(
7058
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7059
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7060
 
7061
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
7062
  }
7063
 
7064
+ static void ggml_cuda_op_acc(
7065
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7066
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7067
 
7068
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7069
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
7080
  (void) dst;
7081
  }
7082
 
7083
+ static void ggml_cuda_op_mul(
7084
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7085
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7086
 
7087
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
7088
  }
7089
 
7090
+ static void ggml_cuda_op_div(
7091
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7092
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7093
 
7094
  ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
7095
  }
7096
 
7097
+ static void ggml_cuda_op_gelu(
7098
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7099
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7100
 
7101
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7102
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7108
  (void) src1_dd;
7109
  }
7110
 
7111
+ static void ggml_cuda_op_silu(
7112
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7113
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7114
 
7115
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7116
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7122
  (void) src1_dd;
7123
  }
7124
 
7125
+ static void ggml_cuda_op_gelu_quick(
7126
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7127
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7128
 
7129
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7130
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7136
  (void) src1_dd;
7137
  }
7138
 
7139
+ static void ggml_cuda_op_tanh(
7140
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7141
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7142
 
7143
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7144
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7150
  (void) src1_dd;
7151
  }
7152
 
7153
+ static void ggml_cuda_op_relu(
7154
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7155
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7156
 
7157
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7158
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7164
  (void) src1_dd;
7165
  }
7166
 
7167
+ static void ggml_cuda_op_leaky_relu(
7168
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7169
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7170
 
7171
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7172
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7181
  (void) src1_dd;
7182
  }
7183
 
7184
+ static void ggml_cuda_op_sqr(
7185
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7186
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7187
 
7188
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7189
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7195
  (void) src1_dd;
7196
  }
7197
 
7198
+ static void ggml_cuda_op_norm(
7199
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7200
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7201
 
7202
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7203
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7215
  (void) src1_dd;
7216
  }
7217
 
7218
+ static void ggml_cuda_op_group_norm(
 
7219
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7220
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7221
 
7222
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7223
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7231
  (void) src1_dd;
7232
  }
7233
 
7234
+ static void ggml_cuda_op_concat(
7235
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7236
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7237
 
7238
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7239
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
7247
  (void) dst;
7248
  }
7249
 
7250
+ static void ggml_cuda_op_upscale(
7251
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7252
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7253
 
7254
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7255
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
7264
  (void) src1_dd;
7265
  }
7266
 
7267
+ static void ggml_cuda_op_pad(
7268
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7269
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7270
 
7271
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7272
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
7281
  (void) src1_dd;
7282
  }
7283
 
7284
+ static void ggml_cuda_op_rms_norm(
7285
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7286
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7287
 
7288
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7289
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7301
  (void) src1_dd;
7302
  }
7303
 
7304
+ static void ggml_cuda_op_mul_mat_q(
7305
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7306
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7307
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7308
 
7309
  const int64_t ne00 = src0->ne[0];
7310
 
 
7366
  static int64_t get_row_rounding(ggml_type type) {
7367
  int64_t min_compute_capability = INT_MAX;
7368
  int64_t max_compute_capability = INT_MIN;
7369
+ for (int id = 0; id < g_device_count; ++id) {
7370
  if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7371
+ if (min_compute_capability > g_device_caps[id].cc) {
7372
+ min_compute_capability = g_device_caps[id].cc;
7373
  }
7374
+ if (max_compute_capability < g_device_caps[id].cc) {
7375
+ max_compute_capability = g_device_caps[id].cc;
7376
  }
7377
  }
7378
  }
 
7424
  #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
7425
  }
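get_row_rounding() and g_tensor_split together determine how the rows of a GGML_BACKEND_GPU_SPLIT tensor are distributed across devices. A simplified sketch of that mapping, following the row_low/row_high computation in ggml_cuda_op_mul_mat further down in this diff (the helper name is hypothetical; the real code stores the boundaries in a per-device dev_data array):

    static void example_row_split(int64_t ne01, int64_t rounding, int64_t * row_low, int64_t * row_high) {
        for (int id = 0; id < g_device_count; ++id) {
            // g_tensor_split holds cumulative fractions; boundaries are aligned to the mul_mat_q tile size
            row_low[id]  = id == 0                  ? 0    : ne01*g_tensor_split[id];
            row_high[id] = id == g_device_count - 1 ? ne01 : ne01*g_tensor_split[id + 1];
            if (id != 0 && row_low[id] < ne01) {
                row_low[id] -= row_low[id] % rounding;
            }
            if (id != g_device_count - 1 && row_high[id] < ne01) {
                row_high[id] -= row_high[id] % rounding;
            }
        }
    }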
7426
 
7427
+ static void ggml_cuda_op_mul_mat_vec_q(
7428
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7429
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7430
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7431
 
7432
  GGML_ASSERT(ggml_nrows(src1) == 1);
7433
 
 
7477
  (void) src1_padded_row_size;
7478
  }
7479
 
7480
+ static void ggml_cuda_op_dequantize_mul_mat_vec(
7481
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7482
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7483
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7484
 
7485
  const int64_t ne00 = src0->ne[0];
7486
  const int64_t row_diff = row_high - row_low;
7487
 
7488
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
7489
+
7490
  // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
7491
  #ifdef GGML_CUDA_F16
7492
+ cuda_pool_alloc<half> src1_dfloat_a;
7493
+ half * src1_dfloat = nullptr; // dfloat == half
7494
 
7495
  bool src1_convert_f16 =
7496
  src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
 
7498
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
7499
 
7500
  if (src1_convert_f16) {
7501
+ src1_dfloat = src1_dfloat_a.alloc(ne00);
7502
  ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
7503
  ne00, 1, sizeof(float), 0, 0,
7504
  ne00, 1, sizeof(half), 0, 0, stream);
 
7546
  break;
7547
  }
7548
 
 
 
 
 
 
 
7549
  (void) src1;
7550
  (void) dst;
7551
  (void) src1_ddq_i;
 
7553
  (void) src1_padded_row_size;
7554
  }
7555
 
7556
+ static void ggml_cuda_op_mul_mat_cublas(
7557
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
7558
  const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
7559
+ const int64_t src1_padded_row_size, cudaStream_t stream) {
7560
 
7561
  GGML_ASSERT(src0_dd_i != nullptr);
7562
  GGML_ASSERT(src1_ddf_i != nullptr);
 
7576
  // ldc == nrows of the matrix that cuBLAS writes into
7577
  int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
7578
 
7579
+ const int compute_capability = g_device_caps[id].cc;
7580
 
7581
  if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
7582
+ //printf("this branch\n");
7583
  // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
7584
+ cuda_pool_alloc<half> src0_as_f16;
 
7585
  if (src0->type != GGML_TYPE_F16) {
7586
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
7587
  GGML_ASSERT(to_fp16_cuda != nullptr);
7588
  size_t ne = row_diff*ne00;
7589
+ src0_as_f16.alloc(ne);
7590
+ to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
7591
  }
7592
+ const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
7593
 
7594
+ cuda_pool_alloc<half> src1_as_f16;
 
7595
  if (src1->type != GGML_TYPE_F16) {
7596
  const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
7597
  GGML_ASSERT(to_fp16_cuda != nullptr);
7598
  size_t ne = src1_ncols*ne10;
7599
+ src1_as_f16.alloc(ne);
7600
+ to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
7601
  }
7602
+ const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
7603
+ cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
 
7604
 
7605
  const half alpha_f16 = 1.0f;
7606
  const half beta_f16 = 0.0f;
 
7609
  CUBLAS_CHECK(
7610
  cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7611
  row_diff, src1_ncols, ne10,
7612
+ &alpha_f16, src0_ptr, CUDA_R_16F, ne00,
7613
+ src1_ptr, CUDA_R_16F, ne10,
7614
+ &beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
7615
  CUBLAS_COMPUTE_16F,
7616
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7617
 
7618
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
7619
+ to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
7620
+ } else {
7621
+ cuda_pool_alloc<float> src0_ddq_as_f32;
7622
+ cuda_pool_alloc<float> src1_ddq_as_f32;
 
 
 
 
 
 
 
 
 
 
 
7623
 
7624
  if (src0->type != GGML_TYPE_F32) {
7625
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
7626
  GGML_ASSERT(to_fp32_cuda != nullptr);
7627
+ src0_ddq_as_f32.alloc(row_diff*ne00);
7628
+ to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
7629
+ }
7630
+ if (src1->type != GGML_TYPE_F32) {
7631
+ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src1->type);
7632
+ GGML_ASSERT(to_fp32_cuda != nullptr);
7633
+ src1_ddq_as_f32.alloc(src1_ncols*ne10);
7634
+ to_fp32_cuda(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream);
7635
  }
7636
+
7637
+ const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
7638
+ const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
7639
 
7640
  const float alpha = 1.0f;
7641
  const float beta = 0.0f;
 
7644
  CUBLAS_CHECK(
7645
  cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7646
  row_diff, src1_ncols, ne10,
7647
+ &alpha, src0_ddf_i, ne00,
7648
+ src1_ddf1_i, ne10,
7649
+ &beta, dst_dd_i, ldc));
 
 
 
 
7650
  }
7651
 
7652
  (void) dst;
 
7654
  (void) src1_padded_row_size;
7655
  }
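The cuBLAS path above chooses between two GEMMs: an FP16 cublasGemmEx when the device and operands allow it, and a plain FP32 cublasSgemm otherwise. Restating the branch condition as a helper for clarity (the function is hypothetical and not part of the diff; CC_VOLTA is the constant used elsewhere in this file):

    static bool example_use_fp16_gemm(const ggml_tensor * src0, const ggml_tensor * dst, int cc, int64_t row_diff) {
        return cc >= CC_VOLTA
            && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))
            && ggml_is_contiguous(src0)
            && row_diff == src0->ne[1]
            && dst->op_params[0] == GGML_PREC_DEFAULT;
    }

In the FP16 branch both inputs are converted to half through cuda_pool_alloc buffers, the product is computed with CUBLAS_COMPUTE_16F, and the half result is converted back to FP32 into dst_dd_i.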
7656
 
7657
+ static void ggml_cuda_op_rope(
7658
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7659
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7660
 
7661
  GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
7662
  GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
 
7734
  (void) src1_dd;
7735
  }
7736
 
7737
+ static void ggml_cuda_op_alibi(
7738
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7739
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7740
 
7741
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7742
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7765
  (void) src1_dd;
7766
  }
7767
 
7768
+ static void ggml_cuda_op_im2col(
7769
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7770
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7771
 
7772
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
7773
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
7800
  (void) src0_dd;
7801
  }
7802
 
7803
+ static void ggml_cuda_op_sum_rows(
 
7804
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7805
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7806
 
7807
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7808
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7817
  (void) src1_dd;
7818
  }
7819
 
7820
+ static void ggml_cuda_op_argsort(
7821
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7822
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7823
 
7824
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7825
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
 
7836
  (void) src1_dd;
7837
  }
7838
 
7839
+ static void ggml_cuda_op_diag_mask_inf(
7840
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7841
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7842
 
7843
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7844
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7856
  (void) src1_dd;
7857
  }
7858
 
7859
+ static void ggml_cuda_op_soft_max(
7860
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7861
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7862
 
7863
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7864
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7877
  (void) dst;
7878
  }
7879
 
7880
+ static void ggml_cuda_op_scale(
7881
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7882
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7883
 
7884
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7885
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7895
  (void) src1_dd;
7896
  }
7897
 
7898
+ static void ggml_cuda_op_clamp(
7899
  const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
7900
+ const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) {
7901
 
7902
  GGML_ASSERT(src0->type == GGML_TYPE_F32);
7903
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
7937
  float * src1_ddf = nullptr;
7938
  float * dst_ddf = nullptr;
7939
 
7940
+ cuda_pool_alloc<float> src0_f;
7941
+ cuda_pool_alloc<float> src1_f;
7942
+ cuda_pool_alloc<float> dst_f;
 
7943
 
7944
  ggml_cuda_set_device(g_main_device);
7945
+ cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
7946
 
7947
  if (src0_on_device) {
7948
  src0_ddf = (float *) src0_extra->data_device[g_main_device];
7949
  } else {
7950
+ src0_ddf = src0_f.alloc(ggml_nelements(src0));
7951
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
7952
  }
7953
 
 
7955
  if (src1_on_device) {
7956
  src1_ddf = (float *) src1_extra->data_device[g_main_device];
7957
  } else {
7958
+ src1_ddf = src1_f.alloc(ggml_nelements(src1));
7959
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
7960
  }
7961
  }
7962
  if (dst_on_device) {
7963
  dst_ddf = (float *) dst_extra->data_device[g_main_device];
7964
  } else {
7965
+ dst_ddf = dst_f.alloc(ggml_nelements(dst));
7966
  }
7967
 
7968
  // do the computation
 
7974
  CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
7975
  }
7976
 
 
 
 
 
 
 
 
 
 
 
7977
  if (dst->backend == GGML_BACKEND_CPU) {
7978
  CUDA_CHECK(cudaDeviceSynchronize());
7979
  }
 
7990
 
7991
  #ifdef NDEBUG
7992
  for (int id = 0; id < g_device_count; ++id) {
7993
+ ggml_cuda_set_device(id);
7994
  CUDA_CHECK(cudaDeviceSynchronize());
7995
  }
7996
 
7997
  for (int id = 0; id < g_device_count; ++id) {
7998
+ ggml_cuda_set_device(id);
7999
 
8000
  for (int id_other = 0; id_other < g_device_count; ++id_other) {
8001
  if (id == id_other) {
 
8029
  const int64_t ne01 = src0->ne[1];
8030
  const int64_t ne02 = src0->ne[2];
8031
  const int64_t ne03 = src0->ne[3];
 
8032
 
8033
  const int64_t ne10 = src1->ne[0];
8034
  const int64_t ne11 = src1->ne[1];
 
8046
 
8047
  GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
8048
  GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
8049
+ GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1));
8050
 
8051
  GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
8052
 
 
8072
  GGML_ASSERT(!(split && ne03 > 1));
8073
  GGML_ASSERT(!(split && ne02 < ne12));
8074
 
8075
+ struct dev_data {
8076
+ cuda_pool_alloc<char> src0_dd_alloc;
8077
+ cuda_pool_alloc<float> src1_ddf_alloc;
8078
+ cuda_pool_alloc<char> src1_ddq_alloc;
8079
+ cuda_pool_alloc<float> dst_dd_alloc;
8080
+
8081
+ char * src0_dd = nullptr;
8082
+ float * src1_ddf = nullptr; // float
8083
+ char * src1_ddq = nullptr; // q8_1
8084
+ float * dst_dd = nullptr;
8085
 
8086
+ int64_t row_low;
8087
+ int64_t row_high;
8088
+ };
 
 
8089
 
8090
+ dev_data dev[GGML_CUDA_MAX_DEVICES];
 
8091
 
8092
  int used_devices = 0;
8093
 
8094
+ for (int id = 0; id < g_device_count; ++id) {
8095
  // by default, use all rows
8096
+ dev[id].row_low = 0;
8097
+ dev[id].row_high = ne01;
8098
 
8099
  // for multi GPU, get the row boundaries from tensor split
8100
  // and round to mul_mat_q tile sizes
 
8102
  const int64_t rounding = get_row_rounding(src0->type);
8103
 
8104
  if (id != 0) {
8105
+ dev[id].row_low = ne01*g_tensor_split[id];
8106
+ if (dev[id].row_low < ne01) {
8107
+ dev[id].row_low -= dev[id].row_low % rounding;
8108
+ }
8109
  }
8110
 
8111
  if (id != g_device_count - 1) {
8112
+ dev[id].row_high = ne01*g_tensor_split[id + 1];
8113
+ if (dev[id].row_high < ne01) {
8114
+ dev[id].row_high -= dev[id].row_high % rounding;
8115
+ }
8116
  }
8117
  }
8118
  }
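The loop above converts the cumulative fractions in g_tensor_split into per-device [row_low, row_high) ranges, rounding interior boundaries down to the mul_mat_q tile granularity returned by get_row_rounding(). A self-contained sketch of the same arithmetic, with made-up split fractions and rounding (plain C++, no CUDA required):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int     device_count    = 2;              // assumed for the example
        const float   tensor_split[2] = {0.0f, 0.5f};   // cumulative fractions, like g_tensor_split
        const int64_t ne01            = 1000;           // total rows of src0 (made up)
        const int64_t rounding        = 32;             // stands in for get_row_rounding(src0->type)

        for (int id = 0; id < device_count; ++id) {
            int64_t row_low  = 0;
            int64_t row_high = ne01;
            if (id != 0) {
                row_low = ne01*tensor_split[id];
                if (row_low < ne01) row_low -= row_low % rounding;      // round down to tile size
            }
            if (id != device_count - 1) {                               // last device always ends at ne01
                row_high = ne01*tensor_split[id + 1];
                if (row_high < ne01) row_high -= row_high % rounding;
            }
            printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
        }
        return 0;
    }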
8119
 
8120
+ for (int id = 0; id < g_device_count; ++id) {
8121
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
8122
  continue;
8123
  }
8124
 
 
8128
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
8129
 
8130
  ggml_cuda_set_device(id);
8131
+ cudaStream_t stream = g_cudaStreams[id][0];
8132
 
8133
  if (src0_on_device && src0_is_contiguous) {
8134
+ dev[id].src0_dd = (char *) src0_extra->data_device[id];
8135
  } else {
8136
+ dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
 
8137
  }
8138
 
8139
  if (src1_on_device && src1_is_contiguous) {
8140
+ dev[id].src1_ddf = (float *) src1_extra->data_device[id];
8141
  } else {
8142
+ dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1));
8143
  }
8144
 
8145
  if (convert_src1_to_q8_1) {
8146
+ dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
8147
 
8148
  if (src1_on_device && src1_is_contiguous) {
8149
+ quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
8150
  CUDA_CHECK(cudaGetLastError());
8151
  }
8152
  }
8153
 
8154
  if (dst_on_device) {
8155
+ dev[id].dst_dd = (float *) dst_extra->data_device[id];
8156
  } else {
8157
+ const size_t size_dst_ddf = split ? (dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst);
8158
+ dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf);
8159
  }
8160
  }
8161
 
8162
  // if multiple devices are used they need to wait for the main device
8163
  // here an event is recorded that signals that the main device has finished calculating the input data
8164
  if (split && used_devices > 1) {
8165
+ ggml_cuda_set_device(g_main_device);
8166
  CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
8167
  }
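The event recorded here is a producer/consumer handshake: the main device signals that the shared input has been prepared, and the secondary devices' streams wait on that event before running their share of the multiplication. A minimal sketch of the record/wait pairing (hypothetical helper, not the ggml function itself):

    #include <cuda_runtime.h>

    // Record "input is ready" on the producer stream, then make every consumer
    // stream wait on it; work queued on the consumers after this call will not
    // start until the producer has reached the recorded point.
    static void wait_for_input(cudaEvent_t input_ready, cudaStream_t producer,
                               cudaStream_t * consumers, int n_consumers) {
        cudaEventRecord(input_ready, producer);
        for (int i = 0; i < n_consumers; ++i) {
            cudaStreamWaitEvent(consumers[i], input_ready, 0);
        }
    }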
8168
 
 
8171
  const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
8172
  const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
8173
 
8174
+ for (int id = 0; id < g_device_count; ++id) {
8175
+ if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) {
8176
  continue;
8177
  }
8178
 
8179
  const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
8180
  const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
8181
+ const int64_t row_diff = dev[id].row_high - dev[id].row_low;
8182
 
8183
  ggml_cuda_set_device(id);
8184
+ cudaStream_t stream = g_cudaStreams[id][is];
8185
 
8186
  // wait for main GPU data if necessary
8187
  if (split && (id != g_main_device || is != 0)) {
 
8195
  const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
8196
 
8197
  // for split tensors the data begins at i0 == i0_offset_low
8198
+ char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
8199
+ float * src1_ddf_i = dev[id].src1_ddf + (i0*ne11 + src1_col_0) * ne10;
8200
+ char * src1_ddq_i = dev[id].src1_ddq + src1_ddq_i_offset;
8201
+ float * dst_dd_i = dev[id].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
8202
 
8203
  // the main device memory buffer can be on VRAM scratch, with space for all partial results
8204
  // in that case an offset on dst_ddf_i is needed
8205
  if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
8206
+ dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
8207
  }
8208
 
8209
  // copy src0, src1 to device if necessary
8210
  if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
8211
  if (id != g_main_device) {
8212
  if (convert_src1_to_q8_1) {
8213
+ char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
8214
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device,
8215
+ src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
8216
  } else {
8217
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
8218
  src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
8219
+ CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device,
8220
+ src1_ncols*ne10*sizeof(float), stream));
8221
  }
8222
  }
8223
  } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
8224
  CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
8225
+ src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
8226
  } else {
8227
  GGML_ASSERT(false);
8228
  }
 
8233
  }
8234
 
8235
  if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
8236
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
8237
  }
8238
 
8239
  // do the computation
8240
  op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
8241
+ dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream);
8242
  CUDA_CHECK(cudaGetLastError());
8243
 
8244
  // copy dst to host or other device if necessary
 
8262
  // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
8263
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
8264
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
8265
+ dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
8266
+ #if !defined(GGML_USE_HIPBLAS)
8267
+ if (kind == cudaMemcpyDeviceToDevice) {
8268
+ // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
8269
+ cudaMemcpy3DPeerParms p = {};
8270
+ p.dstDevice = g_main_device;
8271
+ p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
8272
+ p.srcDevice = id;
8273
+ p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
8274
+ p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
8275
+ CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
8276
+ } else
8277
+ #endif
8278
+ {
8279
+ CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
8280
+ dst_dd_i, row_diff*sizeof(float),
8281
+ row_diff*sizeof(float), src1_ncols,
8282
+ kind, stream));
8283
+ }
8284
  } else {
8285
  float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
8286
  GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
 
8297
  }
8298
  }
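The copy-back above sidesteps cudaMemcpy2DAsync for device-to-device transfers: with the new VMM pools, 2D copies between pools on different devices can fail, so the same strided copy is expressed as a degenerate cudaMemcpy3DPeerAsync with a depth of 1. A hedged sketch of that pattern with assumed parameter names:

    #include <cuda_runtime.h>

    // Copy an n_rows x n_cols tile of floats between two devices.
    // Pitches are given in elements; the depth of 1 makes the 3D peer copy behave like a 2D copy.
    static cudaError_t copy_tile_peer(float * dst, int dst_device, size_t dst_pitch,
                                      const float * src, int src_device, size_t src_pitch,
                                      size_t n_rows, size_t n_cols, cudaStream_t stream) {
        cudaMemcpy3DPeerParms p = {};
        p.dstDevice = dst_device;
        p.dstPtr    = make_cudaPitchedPtr(dst, dst_pitch*sizeof(float), n_rows, n_cols);
        p.srcDevice = src_device;
        p.srcPtr    = make_cudaPitchedPtr((void *) src, src_pitch*sizeof(float), n_rows, n_cols);
        p.extent    = make_cudaExtent(n_rows*sizeof(float), n_cols, 1);
        return cudaMemcpy3DPeerAsync(&p, stream);
    }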
8299
 
8300
  // main device waits for all other devices to be finished
8301
  if (split && g_device_count > 1) {
8302
  int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
8303
  is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
8304
 
8305
+ ggml_cuda_set_device(g_main_device);
8306
+ for (int id = 0; id < g_device_count; ++id) {
8307
+ if (dev[id].row_low == dev[id].row_high) {
8308
  continue;
8309
  }
8310
  for (int64_t is = 0; is < is_max; ++is) {
 
8314
  }
8315
 
8316
  if (dst->backend == GGML_BACKEND_CPU) {
8317
+ ggml_cuda_set_device(g_main_device);
8318
  CUDA_CHECK(cudaDeviceSynchronize());
8319
  }
8320
  }
 
8424
 
8425
  const int64_t ne12 = src1->ne[2];
8426
 
8427
+ ggml_cuda_set_device(g_main_device);
8428
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8429
 
8430
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 
8456
 
8457
  const int64_t ne12 = src1->ne[2];
8458
 
8459
+ ggml_cuda_set_device(g_main_device);
8460
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8461
 
8462
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 
8493
  int64_t i03 = i13 / r3;
8494
  int64_t i02 = i12 / r2;
8495
 
8496
+ ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
8497
+ ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
8498
+ ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3;
8499
  }
8500
 
8501
  static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
8504
 
8505
  GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
8506
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
 
8507
 
8508
+ GGML_TENSOR_BINARY_OP_LOCALS
 
 
8509
 
8510
+ const int64_t ne_dst = ggml_nelements(dst);
 
8511
 
8512
+ ggml_cuda_set_device(g_main_device);
8513
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8514
 
8515
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
8516
 
8517
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
8518
  void * src0_ddq = src0_extra->data_device[g_main_device];
8519
+ half * src0_f16 = (half *) src0_ddq;
8520
 
8521
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
8522
  float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
8525
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
8526
 
8527
  // convert src1 to fp16
8528
+ cuda_pool_alloc<half> src1_f16_alloc;
8529
+ if (src1->type != GGML_TYPE_F16) {
8530
+ const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
8531
+ const int64_t ne_src1 = ggml_nelements(src1);
8532
+ src1_f16_alloc.alloc(ne_src1);
8533
+ GGML_ASSERT(to_fp16_cuda != nullptr);
8534
+ to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
8535
+ }
8536
+ half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
8537
 
8538
+ cuda_pool_alloc<half> dst_f16;
8539
+ char * dst_t;
8540
 
8541
  cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
8542
  cudaDataType_t cu_data_type = CUDA_R_16F;
 
8555
  const void * beta = &beta_f16;
8556
 
8557
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8558
+ dst_t = (char *) dst_f16.alloc(ne_dst);
 
8559
 
8560
  nbd2 /= sizeof(float) / sizeof(half);
8561
  nbd3 /= sizeof(float) / sizeof(half);
 
8602
  CUBLAS_CHECK(
8603
  cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8604
  ne01, ne11, ne10,
8605
+ alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
8606
+ (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
8607
+ beta, ( char *) dst_t, cu_data_type, ne01, nb2/nb0, // strideC
8608
  ne12*ne13,
8609
  cu_compute_type,
8610
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
8612
  // use cublasGemmBatchedEx
8613
  const int ne23 = ne12*ne13;
8614
 
8615
+ cuda_pool_alloc<const void *> ptrs_src(2*ne23);
8616
+ cuda_pool_alloc< void *> ptrs_dst(1*ne23);
 
8617
 
8618
  dim3 block_dims(ne13, ne12);
8619
  k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
8620
+ src0_f16, src1_f16, dst_t,
8621
+ ptrs_src.get(), ptrs_dst.get(),
8622
  ne12, ne13,
8623
  ne23,
8624
  nb02, nb03,
8625
+ src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
8626
+ src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
8627
  nbd2, nbd3,
8628
  r2, r3);
8629
  CUDA_CHECK(cudaGetLastError());
 
8631
  CUBLAS_CHECK(
8632
  cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
8633
  ne01, ne11, ne10,
8634
+ alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
8635
+ (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
8636
+ beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
8637
  ne23,
8638
  cu_compute_type,
8639
  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
8640
  }
8641
  #endif
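cublasGemmBatchedEx takes arrays of device pointers, one entry per batch element; the patch fills those arrays on the GPU with the k_compute_batched_ptrs kernel so the broadcast strides can be applied per (i12, i13) pair without a round trip to the host. For comparison, a host-side sketch that builds the pointer arrays on the CPU and copies them over (simpler but slower; names, shapes and strides below are assumptions, not the ggml code):

    #include <cublas_v2.h>
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>
    #include <vector>

    // C_i = A_i * B_i for i in [0, batch): column-major fp16 matrices,
    // A_i is m x k, B_i is k x n, C_i is m x n, batch items strideX elements apart.
    static void gemm_batched_f16(cublasHandle_t handle, cudaStream_t stream,
                                 const __half * A, const __half * B, __half * C,
                                 int m, int n, int k, int batch,
                                 size_t strideA, size_t strideB, size_t strideC) {
        std::vector<const void *> hA(batch);
        std::vector<const void *> hB(batch);
        std::vector<void *>       hC(batch);
        for (int i = 0; i < batch; ++i) {
            hA[i] = A + (size_t) i*strideA;
            hB[i] = B + (size_t) i*strideB;
            hC[i] = C + (size_t) i*strideC;
        }

        // the pointer arrays themselves must live in device memory
        const void ** dA = nullptr;
        const void ** dB = nullptr;
        void       ** dC = nullptr;
        cudaMalloc((void **) &dA, batch*sizeof(void *));
        cudaMalloc((void **) &dB, batch*sizeof(void *));
        cudaMalloc((void **) &dC, batch*sizeof(void *));
        cudaMemcpy(dA, hA.data(), batch*sizeof(void *), cudaMemcpyHostToDevice);
        cudaMemcpy(dB, hB.data(), batch*sizeof(void *), cudaMemcpyHostToDevice);
        cudaMemcpy(dC, hC.data(), batch*sizeof(void *), cudaMemcpyHostToDevice);

        const __half alpha = __float2half(1.0f);
        const __half beta  = __float2half(0.0f);

        cublasSetStream(handle, stream);
        cublasGemmBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                            &alpha, dA, CUDA_R_16F, m,
                                    dB, CUDA_R_16F, k,
                            &beta,  dC, CUDA_R_16F, m,
                            batch, CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

        // note: cudaFree implicitly synchronizes, so the arrays are not freed mid-GEMM
        cudaFree(dA);
        cudaFree(dB);
        cudaFree(dC);
    }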
8642
 
8643
  if (dst->op_params[0] == GGML_PREC_DEFAULT) {
8644
  const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
8645
+ to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream);
 
 
8646
  }
 
 
8647
  }
8648
 
8649
  static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
8655
  const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
8656
 
8657
  int64_t min_compute_capability = INT_MAX;
8658
+ for (int id = 0; id < g_device_count; ++id) {
8659
+ if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8660
+ min_compute_capability = g_device_caps[id].cc;
8661
  }
8662
  }
8663
 
 
8681
  } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
8682
  // KQV single-batch
8683
  ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
8684
+ } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
8685
  // KQ + KQV multi-batch
8686
  ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
8687
  } else if (src0->type == GGML_TYPE_F32) {
8688
  ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
8689
  } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
8690
+ if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->type == GGML_TYPE_F32) {
8691
  #ifdef GGML_CUDA_FORCE_DMMV
8692
  const bool use_mul_mat_vec_q = false;
8693
  #else
 
8798
  const int64_t ne1 = ggml_nelements(src1);
8799
  const int64_t ne = ggml_nelements(dst);
8800
 
8801
+ ggml_cuda_set_device(g_main_device);
8802
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
8803
 
8804
  CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
8916
 
8917
  std::vector<char> ids_host(ggml_nbytes(ids));
8918
 
8919
+ cudaStream_t stream = g_cudaStreams[g_main_device][0];
8920
 
8921
  if (ids->backend == GGML_BACKEND_GPU) {
8922
  const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
 
8970
  ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8971
  }
8972
  } else {
8973
+ cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
8974
+ cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst));
 
8975
 
8976
+ src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
8977
+ dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
8978
 
8979
  const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
8980
  cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
 
8994
 
8995
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
8996
 
8997
+ CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
8998
  nb11, src1_kind, stream));
8999
  num_src1_rows++;
9000
  }
 
9026
 
9027
  GGML_ASSERT(row_id >= 0 && row_id < n_as);
9028
 
9029
+ CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
9030
  nb1, dst_kind, stream));
9031
  num_src1_rows++;
9032
  }
9033
  }
 
 
 
9034
  }
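In the non-contiguous mul_mat_id path above, the rows selected by the ids tensor are gathered into a contiguous staging buffer, multiplied with the regular mat-mul path, and then scattered back row by row. A reduced sketch of the gather step (hypothetical helper; the real code resolves the row ids from the ids tensor per expert):

    #include <cuda_runtime.h>
    #include <cstddef>

    // Copy n_rows rows, selected by row_ids (already resolved on the host),
    // from src into a contiguous device buffer, one async row copy per row.
    static void gather_rows(char * dst_contiguous, const char * src, const int * row_ids,
                            int n_rows, size_t row_size, cudaStream_t stream) {
        for (int i = 0; i < n_rows; ++i) {
            cudaMemcpyAsync(dst_contiguous + (size_t) i*row_size,
                            src            + (size_t) row_ids[i]*row_size,
                            row_size, cudaMemcpyDeviceToDevice, stream);
        }
    }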
9035
 
9036
  if (dst->backend == GGML_BACKEND_CPU) {
 
9072
  const int64_t nb11 = src1->nb[1];
9073
  const int64_t nb12 = src1->nb[2];
9074
 
9075
+ ggml_cuda_set_device(g_main_device);
9076
  cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
9077
 
9078
  const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
 
9162
  ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
9163
  memset(extra, 0, sizeof(*extra));
9164
 
9165
+ for (int id = 0; id < g_device_count; ++id) {
9166
  if (backend == GGML_BACKEND_GPU && id != g_main_device) {
9167
  continue;
9168
  }
 
9233
 
9234
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9235
 
9236
+ for (int id = 0; id < g_device_count; ++id) {
9237
+ ggml_cuda_set_device(id);
9238
  if (extra->data_device[id] != nullptr) {
 
9239
  CUDA_CHECK(cudaFree(extra->data_device[id]));
9240
  }
9241
 
9242
  for (int64_t is = 0; is < MAX_STREAMS; ++is) {
9243
  if (extra->events[id][is] != nullptr) {
 
9244
  CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
9245
  }
9246
  }
 
9294
  force_inplace;
9295
  const size_t size = ggml_nbytes(tensor);
9296
 
9297
+ ggml_cuda_set_device(g_main_device);
9298
  if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
9299
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
9300
  char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
 
9371
  GGML_ASSERT(ggml_is_contiguous(tensor));
9372
 
9373
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
9374
+ ggml_cuda_set_device(g_main_device);
9375
  CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
9376
  }
9377
 
 
9795
  // host buffer type
9796
 
9797
  static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
9798
+ ggml_cuda_host_free(buffer->context);
9799
  }
9800
 
9801
  static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
9802
+ void * ptr = ggml_cuda_host_malloc(size);
9803
+
9804
+ if (ptr == nullptr) {
9805
+ // fallback to cpu buffer
9806
+ return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
9807
+ }
9808
 
9809
  // FIXME: this is a hack to avoid having to implement a new buffer type
9810
  ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
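The host buffer type now degrades gracefully: when pinned (page-locked) memory cannot be allocated, a regular CPU buffer is used instead of failing outright. A minimal sketch of that allocate-or-fall-back idea using the raw CUDA runtime call (not the ggml_backend API itself):

    #include <cuda_runtime.h>
    #include <cstdlib>

    // Try to get pinned host memory for fast host<->device transfers; fall back
    // to a plain heap allocation if the runtime refuses (e.g. pinned memory exhausted).
    static void * host_buffer_alloc(size_t size, bool * pinned) {
        void * ptr = nullptr;
        if (cudaHostAlloc(&ptr, size, cudaHostAllocDefault) == cudaSuccess) {
            *pinned = true;
            return ptr;
        }
        *pinned = false;
        return malloc(size);
    }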
ggml-quants.c CHANGED
@@ -407,6 +407,18 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
407
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
408
 
409
  #endif
 
410
  #endif
411
 
412
  #if defined(__ARM_NEON) || defined(__wasm_simd128__)
@@ -2468,32 +2480,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
2468
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2469
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2470
 
2471
- #if defined(__ARM_FEATURE_DOTPROD)
2472
  // dot product into int32x4_t
2473
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2474
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2475
 
2476
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2477
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2478
- #else
2479
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
2480
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
2481
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
2482
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
2483
-
2484
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
2485
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
2486
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
2487
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
2488
-
2489
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
2490
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
2491
- const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2492
- const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2493
-
2494
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2495
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2496
- #endif
2497
  }
2498
 
2499
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -2776,32 +2768,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
2776
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2777
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2778
 
2779
- #if defined(__ARM_FEATURE_DOTPROD)
2780
  // dot product into int32x4_t
2781
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2782
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2783
 
2784
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
2785
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
2786
- #else
2787
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
2788
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
2789
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
2790
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
2791
-
2792
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
2793
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
2794
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
2795
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
2796
-
2797
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
2798
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
2799
- const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2800
- const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2801
-
2802
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
2803
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
2804
- #endif
2805
  }
2806
 
2807
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -2963,32 +2935,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
2963
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2964
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2965
 
2966
- #if defined(__ARM_FEATURE_DOTPROD)
2967
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2968
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2969
  vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2970
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2971
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2972
  vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2973
- #else
2974
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
2975
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
2976
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
2977
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
2978
-
2979
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
2980
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
2981
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
2982
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
2983
-
2984
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
2985
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
2986
- const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
2987
- const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
2988
-
2989
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2990
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2991
- #endif
2992
  }
2993
 
2994
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3275,32 +3227,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
3275
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
3276
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
3277
 
3278
- #if defined(__ARM_FEATURE_DOTPROD)
3279
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3280
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
3281
  vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
3282
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3283
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
3284
  vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
3285
- #else
3286
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
3287
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
3288
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
3289
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
3290
-
3291
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
3292
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
3293
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
3294
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
3295
-
3296
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
3297
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
3298
- const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
3299
- const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
3300
-
3301
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
3302
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
3303
- #endif
3304
  }
3305
 
3306
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -3550,7 +3482,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
3550
  const int8x16_t y1_0 = vld1q_s8(y1->qs);
3551
  const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
3552
 
3553
- #if defined(__ARM_FEATURE_DOTPROD)
3554
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3555
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3556
  vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
@@ -3558,26 +3489,6 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
3558
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3559
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3560
  vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3561
-
3562
- #else
3563
- const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
3564
- const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
3565
- const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
3566
- const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
3567
-
3568
- const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
3569
- const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
3570
- const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
3571
- const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
3572
-
3573
- const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
3574
- const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
3575
- const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
3576
- const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
3577
-
3578
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
3579
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3580
- #endif
3581
  }
3582
 
3583
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3650,12 +3561,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
3650
  const int nb = n / QK_K;
3651
 
3652
  #ifdef __ARM_NEON
3653
-
3654
  const uint8x16_t m3 = vdupq_n_u8(0x3);
3655
  const uint8x16_t m4 = vdupq_n_u8(0xF);
3656
- #if defined(__ARM_FEATURE_DOTPROD)
3657
- const int32x4_t vzero = vdupq_n_s32(0);
3658
- #endif
3659
 
3660
  ggml_int8x16x2_t q2bytes;
3661
  uint8_t aux[16];
@@ -3663,7 +3572,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
3663
  float sum = 0;
3664
 
3665
  for (int i = 0; i < nb; ++i) {
3666
-
3667
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
3668
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
3669
 
@@ -3689,20 +3597,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
3689
 
3690
  // We use this macro instead of a function call because for some reason
3691
  // the code runs 2-3% slower, even if the function is declared inline
3692
- #if defined(__ARM_FEATURE_DOTPROD)
3693
  #define MULTIPLY_ACCUM_WITH_SCALE(index)\
3694
  isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
3695
  isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
3696
- #else
3697
- #define MULTIPLY_ACCUM_WITH_SCALE(index)\
3698
- {\
3699
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\
3700
- vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
3701
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\
3702
- vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
3703
- isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
3704
- }
3705
- #endif
3706
 
3707
  #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
3708
  q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
@@ -3710,26 +3607,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
3710
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
3711
  MULTIPLY_ACCUM_WITH_SCALE((index));
3712
 
3713
-
3714
  for (int j = 0; j < QK_K/128; ++j) {
3715
-
3716
  const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
3717
 
3718
  ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
3719
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
3720
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
 
3721
  MULTIPLY_ACCUM_WITH_SCALE(0);
3722
 
3723
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
3724
-
3725
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
3726
-
3727
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
3728
 
3729
  is += 8;
3730
  }
3731
- sum += d * isum;
3732
 
 
3733
  }
3734
 
3735
  *s = sum;
@@ -4043,11 +3937,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
4043
  const int nb = n / QK_K;
4044
 
4045
  #ifdef __ARM_NEON
4046
-
4047
  const uint8x16_t m3 = vdupq_n_u8(0x3);
4048
- #if defined(__ARM_FEATURE_DOTPROD)
4049
- const int32x4_t vzero = vdupq_n_s32(0);
4050
- #endif
4051
 
4052
  ggml_int8x16x4_t q2bytes;
4053
 
@@ -4081,28 +3973,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
4081
  q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
4082
  q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
4083
 
4084
- #if defined(__ARM_FEATURE_DOTPROD)
4085
  isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
4086
  isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
4087
  isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
4088
  isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
4089
- #else
4090
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
4091
- vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
4092
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
4093
- vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
4094
- isum1 += vaddvq_s16(p1) * scales[0];
4095
- isum2 += vaddvq_s16(p2) * scales[1];
4096
-
4097
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
4098
- vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
4099
- const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
4100
- vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
4101
- isum1 += vaddvq_s16(p3) * scales[2];
4102
- isum2 += vaddvq_s16(p4) * scales[3];
4103
- #endif
4104
- sum += d * (isum1 + isum2);
4105
 
 
4106
  }
4107
 
4108
  *s = sum;
@@ -4328,9 +4204,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
4328
  uint32_t utmp[4];
4329
 
4330
  const uint8x16_t m3b = vdupq_n_u8(0x3);
4331
- #ifdef __ARM_FEATURE_DOTPROD
4332
  const int32x4_t vzero = vdupq_n_s32(0);
4333
- #endif
4334
 
4335
  const uint8x16_t m0 = vdupq_n_u8(1);
4336
  const uint8x16_t m1 = vshlq_n_u8(m0, 1);
@@ -4382,22 +4256,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
4382
  q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
4383
  q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
4384
 
4385
- #if defined(__ARM_FEATURE_DOTPROD)
4386
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
4387
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
4388
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
4389
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
4390
- #else
4391
- int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])),
4392
- vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
4393
- int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])),
4394
- vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
4395
- int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])),
4396
- vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
4397
- int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])),
4398
- vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
4399
- isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
4400
- #endif
4401
  scale += 4;
4402
 
4403
  q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
@@ -4410,22 +4273,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
4410
  q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
4411
  q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
4412
 
4413
- #if defined(__ARM_FEATURE_DOTPROD)
4414
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
4415
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
4416
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
4417
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
4418
- #else
4419
- p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])),
4420
- vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
4421
- p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])),
4422
- vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
4423
- p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])),
4424
- vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
4425
- p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])),
4426
- vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
4427
- isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
4428
- #endif
4429
  scale += 4;
4430
 
4431
  if (j == 0) {
@@ -4864,10 +4716,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
4864
  const int nb = n / QK_K;
4865
 
4866
  #ifdef __ARM_NEON
4867
-
4868
- #ifdef __ARM_FEATURE_DOTPROD
4869
- const int32x4_t vzero = vdupq_n_s32(0);
4870
- #endif
4871
 
4872
  const uint8x16_t m3b = vdupq_n_u8(0x3);
4873
  const uint8x16_t mh = vdupq_n_u8(4);
@@ -4908,22 +4757,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
4908
  q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
4909
  q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
4910
 
4911
- #if defined(__ARM_FEATURE_DOTPROD)
4912
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
4913
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
4914
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
4915
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
4916
- #else
4917
- const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
4918
- vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
4919
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
4920
- vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
4921
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
4922
- vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
4923
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
4924
- vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
4925
- isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
4926
- #endif
4927
 
4928
  sum += d * isum;
4929
 
@@ -5228,11 +5065,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5228
  uint32_t utmp[4];
5229
 
5230
  #ifdef __ARM_NEON
5231
-
5232
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5233
- #ifdef __ARM_FEATURE_DOTPROD
5234
  const int32x4_t mzero = vdupq_n_s32(0);
5235
- #endif
5236
 
5237
  ggml_int8x16x2_t q4bytes;
5238
  ggml_int8x16x2_t q8bytes;
@@ -5269,10 +5103,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5269
  int32_t sumi2 = 0;
5270
 
5271
  for (int j = 0; j < QK_K/64; ++j) {
5272
-
5273
  const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
5274
 
5275
- #ifdef __ARM_FEATURE_DOTPROD
5276
  q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
5277
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5278
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
@@ -5287,26 +5119,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5287
  const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
5288
 
5289
  sumi2 += vaddvq_s32(p2) * scales[2*j+1];
5290
- #else
5291
- q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
5292
- q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5293
- q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
5294
- const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
5295
- vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
5296
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
5297
- vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
5298
- sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
5299
-
5300
- q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
5301
- q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
5302
- q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
5303
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
5304
- vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
5305
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
5306
- vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
5307
- sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
5308
-
5309
- #endif
5310
  }
5311
 
5312
  sumf += d * (sumi1 + sumi2);
@@ -5603,12 +5415,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5603
  const int nb = n / QK_K;
5604
 
5605
  #ifdef __ARM_NEON
5606
-
5607
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5608
 
5609
- #ifdef __ARM_FEATURE_DOTPROD
5610
  const int32x4_t mzero = vdupq_n_s32(0);
5611
- #endif
5612
 
5613
  float sumf = 0;
5614
 
@@ -5636,7 +5445,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5636
 
5637
  const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
5638
 
5639
- #ifdef __ARM_FEATURE_DOTPROD
5640
  q8bytes = ggml_vld1q_s8_x4(q8);
5641
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5642
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
@@ -5650,27 +5458,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
5650
  const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
5651
  const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
5652
 
5653
- #else
5654
- q8bytes = ggml_vld1q_s8_x4(q8);
5655
- q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5656
- q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
5657
- const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
5658
- vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
5659
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
5660
- vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
5661
- int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
5662
-
5663
- q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
5664
- q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
5665
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
5666
- vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
5667
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
5668
- vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
5669
- int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
5670
-
5671
- #endif
5672
  sumf += d * (sumi1 + sumi2);
5673
-
5674
  }
5675
 
5676
  *s = sumf - sum_mins;
@@ -5875,15 +5663,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
5875
 
5876
  uint32_t utmp[4];
5877
 
5878
-
5879
  #ifdef __ARM_NEON
5880
-
5881
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5882
  const uint8x16_t mone = vdupq_n_u8(1);
5883
  const uint8x16_t mtwo = vdupq_n_u8(2);
5884
- #if defined(__ARM_FEATURE_DOTPROD)
5885
  const int32x4_t mzero = vdupq_n_s32(0);
5886
- #endif
5887
 
5888
  ggml_int8x16x4_t q5bytes;
5889
 
@@ -5938,28 +5722,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
5938
  q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
5939
  q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
5940
 
5941
- #if defined(__ARM_FEATURE_DOTPROD)
5942
-
5943
  sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
5944
  sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
5945
- #else
5946
-
5947
- const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
5948
- vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
5949
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
5950
- vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
5951
- sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
5952
-
5953
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
5954
- vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
5955
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
5956
- vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
5957
- sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
5958
- #endif
5959
  }
5960
 
5961
  sumf += d * sumi - dmin * sumi_mins;
5962
-
5963
  }
5964
 
5965
  *s = sumf;
@@ -6311,12 +6078,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
6311
  const int nb = n / QK_K;
6312
 
6313
  #ifdef __ARM_NEON
6314
-
6315
  const uint8x16_t m4b = vdupq_n_u8(0xf);
6316
  const uint8x16_t mh = vdupq_n_u8(16);
6317
- #if defined(__ARM_FEATURE_DOTPROD)
6318
  const int32x4_t mzero = vdupq_n_s32(0);
6319
- #endif
6320
 
6321
  ggml_int8x16x4_t q5bytes;
6322
  ggml_uint8x16x4_t q5h;
@@ -6348,32 +6112,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
6348
  q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
6349
  q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
6350
 
6351
- #if defined(__ARM_FEATURE_DOTPROD)
6352
-
6353
  int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
6354
  int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
6355
  int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
6356
  int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
6357
 
6358
  sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
6359
-
6360
- #else
6361
-
6362
- const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
6363
- vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
6364
- const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
6365
- vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
6366
- int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
6367
-
6368
- const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
6369
- vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
6370
- const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
6371
- vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
6372
- sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
6373
-
6374
- sumf += d*sumi;
6375
- #endif
6376
-
6377
  }
6378
 
6379
  *s = sumf;
@@ -6600,13 +6344,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
6600
  const int nb = n / QK_K;
6601
 
6602
  #ifdef __ARM_NEON
6603
-
6604
  float sum = 0;
6605
 
6606
  const uint8x16_t m4b = vdupq_n_u8(0xF);
6607
- #if defined(__ARM_FEATURE_DOTPROD)
6608
  const int32x4_t vzero = vdupq_n_s32(0);
6609
- #endif
6610
  //const int8x16_t m32s = vdupq_n_s8(32);
6611
 
6612
  const uint8x16_t mone = vdupq_n_u8(3);
@@ -6658,30 +6399,12 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
6658
  q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
6659
  q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
6660
 
6661
- #if defined(__ARM_FEATURE_DOTPROD)
6662
-
6663
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
6664
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
6665
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
6666
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
6667
- scale += 4;
6668
 
6669
- #else
6670
-
6671
- int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
6672
- vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
6673
- int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
6674
- vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
6675
- isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
6676
- scale += 2;
6677
-
6678
- int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
6679
- vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
6680
- int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
6681
- vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
6682
- isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
6683
- scale += 2;
6684
- #endif
6685
 
6686
  q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
6687
 
@@ -6703,34 +6426,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
6703
  q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
6704
  q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
6705
 
6706
- #if defined(__ARM_FEATURE_DOTPROD)
6707
-
6708
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
6709
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
6710
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
6711
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
6712
  scale += 4;
6713
-
6714
- //for (int l = 0; l < 4; ++l) {
6715
- // const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
6716
- // isum += vaddvq_s32(p) * *scale++;
6717
- //}
6718
- #else
6719
- p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
6720
- vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
6721
- p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
6722
- vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
6723
- isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
6724
- scale += 2;
6725
-
6726
- p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
6727
- vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
6728
- p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
6729
- vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
6730
- isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
6731
- scale += 2;
6732
- #endif
6733
-
6734
  }
6735
  //sum += isum * d_all * y[i].d;
6736
  sum += d_all * y[i].d * (isum - 32 * isum_mins);
@@ -7076,14 +6776,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
7076
  const int nb = n / QK_K;
7077
 
7078
  #ifdef __ARM_NEON
7079
-
7080
  float sum = 0;
7081
 
7082
  const uint8x16_t m4b = vdupq_n_u8(0xF);
7083
  const int8x16_t m32s = vdupq_n_s8(32);
7084
- #if defined(__ARM_FEATURE_DOTPROD)
7085
  const int32x4_t vzero = vdupq_n_s32(0);
7086
- #endif
7087
 
7088
  const uint8x16_t mone = vdupq_n_u8(3);
7089
 
@@ -7119,26 +6816,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
7119
  q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
7120
  q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
7121
 
7122
- #if defined(__ARM_FEATURE_DOTPROD)
7123
-
7124
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
7125
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
7126
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
7127
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
7128
- #else
7129
-
7130
- int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
7131
- vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
7132
- int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
7133
- vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
7134
- isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
7135
-
7136
- int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
7137
- vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
7138
- int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
7139
- vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
7140
- isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
7141
- #endif
7142
 
7143
  sum += isum * d_all * y[i].d;
7144
 
 
407
  #define ggml_vld1q_s8_x4 vld1q_s8_x4
408
 
409
  #endif
410
+
411
+ #if !defined(__ARM_FEATURE_DOTPROD)
412
+
413
+ inline static int32x4_t vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
414
+ const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
415
+ const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
416
+
417
+ return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
418
+ }
419
+
420
+ #endif
421
+
422
  #endif
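With this shim in place, the per-call-site #if defined(__ARM_FEATURE_DOTPROD) branches elsewhere in ggml-quants.c collapse into unconditional vdotq_s32 calls: on CPUs with the dotprod extension the native instruction is used, and otherwise the vmull_s8/vpaddlq_s16 fallback yields the same horizontal sums (the per-lane grouping differs, which is fine because the callers only reduce with vaddvq_s32). A small usage sketch with made-up data (aarch64; build with -march=armv8.2-a+dotprod, or rely on the fallback above when compiling without dotprod):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        int8_t a[16], b[16];
        for (int i = 0; i < 16; ++i) {
            a[i] = (int8_t) (i - 8);
            b[i] = (int8_t) (2*i);
        }

        const int8x16_t va = vld1q_s8(a);
        const int8x16_t vb = vld1q_s8(b);

        // accumulate int8 products into four int32 lanes, then reduce
        const int32x4_t acc = vdotq_s32(vdupq_n_s32(0), va, vb);
        printf("dot = %d\n", vaddvq_s32(acc)); // full 16-element dot product

        return 0;
    }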
423
 
424
  #if defined(__ARM_NEON) || defined(__wasm_simd128__)
 
2480
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2481
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2482
 
 
2483
  // dot product into int32x4_t
2484
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
2485
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
2486
 
2487
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2488
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
 
2489
  }
2490
 
2491
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 
2768
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2769
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2770
 
 
2771
  // dot product into int32x4_t
2772
  const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
2773
  const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
2774
 
2775
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
2776
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
 
2777
  }
2778
 
2779
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
 
2935
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2936
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2937
 
 
2938
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
2939
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
2940
  vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
2941
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
2942
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
2943
  vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
2944
  }
2945
 
2946
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 
3227
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
3228
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
3229
 
 
3230
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3231
  vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
3232
  vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
3233
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3234
  vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
3235
  vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
3236
  }
3237
 
3238
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
 
3482
  const int8x16_t y1_0 = vld1q_s8(y1->qs);
3483
  const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
3484
 
 
3485
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
3486
  vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
3487
  vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
 
3489
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
3490
  vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
3491
  vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
3492
  }
3493
 
3494
  *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 
3561
  const int nb = n / QK_K;
3562
 
3563
  #ifdef __ARM_NEON
 
3564
  const uint8x16_t m3 = vdupq_n_u8(0x3);
3565
  const uint8x16_t m4 = vdupq_n_u8(0xF);
3566
+
3567
+ const int32x4_t vzero = vdupq_n_s32(0);
 
3568
 
3569
  ggml_int8x16x2_t q2bytes;
3570
  uint8_t aux[16];
 
3572
  float sum = 0;
3573
 
3574
  for (int i = 0; i < nb; ++i) {
 
3575
  const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
3576
  const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
3577
 
 
3597
 
3598
  // We use this macro instead of a function call because for some reason
3599
  // the code runs 2-3% slower, even if the function is declared inline
 
3600
  #define MULTIPLY_ACCUM_WITH_SCALE(index)\
3601
  isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
3602
  isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
3603
 
3604
  #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
3605
  q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
3606
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\
3607
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
3608
  MULTIPLY_ACCUM_WITH_SCALE((index));
3609
 
 
3610
  for (int j = 0; j < QK_K/128; ++j) {
 
3611
  const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;
3612
 
3613
  ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
3614
  q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
3615
  q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
3616
+
3617
  MULTIPLY_ACCUM_WITH_SCALE(0);
3618
 
3619
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
 
3620
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
 
3621
  SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);
3622
 
3623
  is += 8;
3624
  }
 
3625
 
3626
+ sum += d * isum;
3627
  }
3628
 
3629
  *s = sum;
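In the Q2_K kernel above, MULTIPLY_ACCUM_WITH_SCALE and SHIFT_MULTIPLY_ACCUM_WITH_SCALE stay as macros because, per the comment, an equivalent inline function measured 2-3% slower. As a rough scalar sketch of what one MULTIPLY_ACCUM_WITH_SCALE(index) invocation adds to isum, assuming hypothetical flat views of the two 16-byte groups and of the per-sub-block scales (not ggml's actual data layout):

    #include <stdint.h>

    // Each invocation handles two 16-byte groups; group g is weighted by the
    // sub-block scale aux[is + index + g].
    static int32_t q2k_accum_sketch(const int8_t q2[32], const int8_t q8[32],
                                    const uint8_t *aux, int is, int index) {
        int32_t isum = 0;
        for (int g = 0; g < 2; ++g) {
            int32_t dot = 0;
            for (int k = 0; k < 16; ++k) {
                dot += (int32_t) q2[16*g + k] * (int32_t) q8[16*g + k];
            }
            isum += dot * aux[is + index + g];
        }
        return isum;
    }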
 
3937
  const int nb = n / QK_K;
3938
 
3939
  #ifdef __ARM_NEON
 
3940
  const uint8x16_t m3 = vdupq_n_u8(0x3);
3941
+
3942
+ const int32x4_t vzero = vdupq_n_s32(0);
 
3943
 
3944
  ggml_int8x16x4_t q2bytes;
3945
 
 
3973
  q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
3974
  q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));
3975
 
 
3976
  isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
3977
  isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
3978
  isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
3979
  isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
 
3980
 
3981
+ sum += d * (isum1 + isum2);
3982
  }
3983
 
3984
  *s = sum;
 
4204
  uint32_t utmp[4];
4205
 
4206
  const uint8x16_t m3b = vdupq_n_u8(0x3);
 
4207
  const int32x4_t vzero = vdupq_n_s32(0);
 
4208
 
4209
  const uint8x16_t m0 = vdupq_n_u8(1);
4210
  const uint8x16_t m1 = vshlq_n_u8(m0, 1);
 
4256
  q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
4257
  q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
4258
 
 
4259
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
4260
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
4261
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
4262
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
4263
+
4264
  scale += 4;
4265
 
4266
  q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
 
4273
  q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
4274
  q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));
4275
 
 
4276
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
4277
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
4278
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
4279
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
4280
4281
  scale += 4;
4282
 
4283
  if (j == 0) {
 
4716
  const int nb = n / QK_K;
4717
 
4718
  #ifdef __ARM_NEON
4719
+ const int32x4_t vzero = vdupq_n_s32(0);
4720
 
4721
  const uint8x16_t m3b = vdupq_n_u8(0x3);
4722
  const uint8x16_t mh = vdupq_n_u8(4);
 
4757
  q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
4758
  q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));
4759
 
 
4760
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
4761
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
4762
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
4763
  isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
4764
 
4765
  sum += d * isum;
4766
 
 
5065
  uint32_t utmp[4];
5066
 
5067
  #ifdef __ARM_NEON
 
5068
  const uint8x16_t m4b = vdupq_n_u8(0xf);
 
5069
  const int32x4_t mzero = vdupq_n_s32(0);
 
5070
 
5071
  ggml_int8x16x2_t q4bytes;
5072
  ggml_int8x16x2_t q8bytes;
 
5103
  int32_t sumi2 = 0;
5104
 
5105
  for (int j = 0; j < QK_K/64; ++j) {
 
5106
  const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
5107
 
 
5108
  q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
5109
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5110
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
5119
  const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
5120
 
5121
  sumi2 += vaddvq_s32(p2) * scales[2*j+1];
5122
  }
5123
 
5124
  sumf += d * (sumi1 + sumi2);
 
5415
  const int nb = n / QK_K;
5416
 
5417
  #ifdef __ARM_NEON
 
5418
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5419
 
 
5420
  const int32x4_t mzero = vdupq_n_s32(0);
 
5421
 
5422
  float sumf = 0;
5423
 
 
5445
 
5446
  const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);
5447
 
 
5448
  q8bytes = ggml_vld1q_s8_x4(q8);
5449
  q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
5450
  q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
 
5458
  const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
5459
  const int32_t sumi2 = vaddvq_s32(p2) * scales[1];
5460
 
5461
  sumf += d * (sumi1 + sumi2);
 
5462
  }
5463
 
5464
  *s = sumf - sum_mins;
 
5663
 
5664
  uint32_t utmp[4];
5665
 
 
5666
  #ifdef __ARM_NEON
 
5667
  const uint8x16_t m4b = vdupq_n_u8(0xf);
5668
  const uint8x16_t mone = vdupq_n_u8(1);
5669
  const uint8x16_t mtwo = vdupq_n_u8(2);
 
5670
  const int32x4_t mzero = vdupq_n_s32(0);
 
5671
 
5672
  ggml_int8x16x4_t q5bytes;
5673
 
 
5722
  q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
5723
  q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));
5724
 
 
 
5725
  sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
5726
  sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
5727
  }
5728
 
5729
  sumf += d * sumi - dmin * sumi_mins;
 
5730
  }
5731
 
5732
  *s = sumf;
 
6078
  const int nb = n / QK_K;
6079
 
6080
  #ifdef __ARM_NEON
 
6081
  const uint8x16_t m4b = vdupq_n_u8(0xf);
6082
  const uint8x16_t mh = vdupq_n_u8(16);
 
6083
  const int32x4_t mzero = vdupq_n_s32(0);
 
6084
 
6085
  ggml_int8x16x4_t q5bytes;
6086
  ggml_uint8x16x4_t q5h;
 
6112
  q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
6113
  q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));
6114
 
 
 
6115
  int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
6116
  int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
6117
  int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
6118
  int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
6119
 
6120
  sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
6121
  }
6122
 
6123
  *s = sumf;
 
6344
  const int nb = n / QK_K;
6345
 
6346
  #ifdef __ARM_NEON
 
6347
  float sum = 0;
6348
 
6349
  const uint8x16_t m4b = vdupq_n_u8(0xF);
 
6350
  const int32x4_t vzero = vdupq_n_s32(0);
 
6351
  //const int8x16_t m32s = vdupq_n_s8(32);
6352
 
6353
  const uint8x16_t mone = vdupq_n_u8(3);
 
6399
  q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
6400
  q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
6401
 
 
 
6402
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
6403
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
6404
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
6405
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
 
6406
 
6407
+ scale += 4;
6408
 
6409
  q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
6410
 
 
6426
  q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
6427
  q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
6428
 
 
 
6429
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
6430
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
6431
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
6432
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
6433
  scale += 4;
6434
  }
6435
  //sum += isum * d_all * y[i].d;
6436
  sum += d_all * y[i].d * (isum - 32 * isum_mins);
 
6776
  const int nb = n / QK_K;
6777
 
6778
  #ifdef __ARM_NEON
 
6779
  float sum = 0;
6780
 
6781
  const uint8x16_t m4b = vdupq_n_u8(0xF);
6782
  const int8x16_t m32s = vdupq_n_s8(32);
 
6783
  const int32x4_t vzero = vdupq_n_s32(0);
 
6784
 
6785
  const uint8x16_t mone = vdupq_n_u8(3);
6786
 
 
6816
  q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
6817
  q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
6818
 
 
 
6819
  isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
6820
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
6821
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
6822
  vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
6823
 
6824
  sum += isum * d_all * y[i].d;
6825
 
ggml.c CHANGED
@@ -4041,7 +4041,6 @@ static struct ggml_tensor * ggml_group_norm_impl(
4041
  result->op = GGML_OP_GROUP_NORM;
4042
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4043
  result->src[0] = a;
4044
- result->src[1] = NULL; // TODO: maybe store epsilon here?
4045
 
4046
  return result;
4047
  }
@@ -5541,7 +5540,6 @@ static struct ggml_tensor * ggml_upscale_impl(
5541
  result->op_params[0] = scale_factor;
5542
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5543
  result->src[0] = a;
5544
- result->src[1] = NULL;
5545
 
5546
  return result;
5547
  }
@@ -5846,7 +5844,6 @@ struct ggml_tensor * ggml_get_rel_pos(
5846
  result->op = GGML_OP_GET_REL_POS;
5847
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5848
  result->src[0] = a;
5849
- result->src[1] = NULL;
5850
 
5851
  return result;
5852
  }
@@ -9690,7 +9687,7 @@ static void ggml_compute_forward_mul_mat(
9690
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9691
 
9692
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9693
- assert(src1->type == GGML_TYPE_F32);
9694
 
9695
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9696
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -17456,9 +17453,9 @@ static void ggml_opt_acc_grad(int np, struct ggml_tensor * const ps[], float * g
17456
  }
17457
 
17458
  //
17459
- // ADAM
17460
  //
17461
- // ref: https://arxiv.org/pdf/1412.6980.pdf
17462
  //
17463
 
17464
  static enum ggml_opt_result ggml_opt_adam(
@@ -19351,7 +19348,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
19351
  data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
19352
  }
19353
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
19354
- free(data);
19355
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
19356
  GGML_ASSERT(false && "nested arrays not supported");
19357
  } else {
 
4041
  result->op = GGML_OP_GROUP_NORM;
4042
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
4043
  result->src[0] = a;
 
4044
 
4045
  return result;
4046
  }
 
5540
  result->op_params[0] = scale_factor;
5541
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5542
  result->src[0] = a;
 
5543
 
5544
  return result;
5545
  }
 
5844
  result->op = GGML_OP_GET_REL_POS;
5845
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5846
  result->src[0] = a;
 
5847
 
5848
  return result;
5849
  }
 
9687
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
9688
 
9689
  assert(params->wsize >= ne11*ne12*ne13*row_size);
9690
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
9691
 
9692
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
9693
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
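The src1 type check in ggml_compute_forward_mul_mat is upgraded from assert to GGML_ASSERT, so the F32 requirement keeps being enforced in builds where NDEBUG disables plain assert. A minimal sketch of that kind of always-on check, using a made-up macro name rather than ggml's actual definition:

    #include <stdio.h>
    #include <stdlib.h>

    // Hypothetical always-on assertion in the spirit of GGML_ASSERT: unlike
    // assert() from <assert.h>, it is not removed when NDEBUG is defined.
    #define ALWAYS_ASSERT(x)                                              \
        do {                                                              \
            if (!(x)) {                                                   \
                fprintf(stderr, "assertion failed: %s:%d: %s\n",          \
                        __FILE__, __LINE__, #x);                          \
                abort();                                                  \
            }                                                             \
        } while (0)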
 
17453
  }
17454
 
17455
  //
17456
+ // Using AdamW - ref: https://arxiv.org/pdf/1711.05101v3.pdf
17457
  //
17458
+ // (Original Adam - ref: https://arxiv.org/pdf/1412.6980.pdf)
17459
  //
17460
 
17461
  static enum ggml_opt_result ggml_opt_adam(
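The comment now points at AdamW rather than the original Adam paper; the practical difference is decoupled weight decay, applied directly to the parameters instead of being folded into the gradient. A scalar sketch of a single AdamW parameter update (illustrative only, not ggml_opt_adam's implementation):

    #include <math.h>

    // One AdamW step for one parameter x. m and v are the running first and
    // second moment estimates, t is the 1-based iteration count.
    static float adamw_step(float x, float g, float *m, float *v, int t,
                            float lr, float b1, float b2, float eps, float wd) {
        *m = b1 * (*m) + (1.0f - b1) * g;
        *v = b2 * (*v) + (1.0f - b2) * g * g;
        const float mhat = *m / (1.0f - powf(b1, (float) t));
        const float vhat = *v / (1.0f - powf(b2, (float) t));
        // decoupled weight decay: wd * x is subtracted directly,
        // not added to the gradient before the moment updates
        return x - lr * (mhat / (sqrtf(vhat) + eps) + wd * x);
    }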
 
19348
  data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
19349
  }
19350
  gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
19351
+ free((void *)data);
19352
  } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
19353
  GGML_ASSERT(false && "nested arrays not supported");
19354
  } else {
ggml.h CHANGED
@@ -255,6 +255,8 @@
255
  #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
256
  #elif defined(__GNUC__)
257
  #define GGML_UNREACHABLE() __builtin_unreachable()
 
 
258
  #else
259
  #define GGML_UNREACHABLE() ((void) 0)
260
  #endif
 
255
  #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
256
  #elif defined(__GNUC__)
257
  #define GGML_UNREACHABLE() __builtin_unreachable()
258
+ #elif defined(_MSC_VER)
259
+ #define GGML_UNREACHABLE() __assume(0)
260
  #else
261
  #define GGML_UNREACHABLE() ((void) 0)
262
  #endif
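The new branch maps GGML_UNREACHABLE() to __assume(0) on MSVC, the counterpart of GCC/Clang's __builtin_unreachable(). A typical use is marking the end of an exhaustively handled switch so the compiler can drop the fall-through path and stop warning about a missing return; the enum and function below are made up for illustration:

    #include "ggml.h" // for GGML_UNREACHABLE()

    enum direction { DIR_UP, DIR_DOWN };

    static int direction_sign(enum direction d) {
        switch (d) {
            case DIR_UP:   return +1;
            case DIR_DOWN: return -1;
        }
        GGML_UNREACHABLE(); // every enum value is handled above
    }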