talk-llama : sync llama.cpp
- examples/talk-llama/llama.cpp (+272 -10)
- examples/talk-llama/llama.h (+4 -1)
examples/talk-llama/llama.cpp CHANGED
@@ -11,6 +11,10 @@
 # include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 # include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+# include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+# include "ggml-sycl.h"
 #endif

 #ifdef GGML_USE_METAL
@@ -52,6 +56,7 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
+#include <cfloat>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
@@ -196,6 +201,7 @@ enum llm_arch {
     LLM_ARCH_PHI2,
     LLM_ARCH_PLAMO,
     LLM_ARCH_CODESHELL,
+    LLM_ARCH_ORION,
     LLM_ARCH_UNKNOWN,
 };

@@ -217,6 +223,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_PHI2,            "phi2"      },
     { LLM_ARCH_PLAMO,           "plamo"     },
     { LLM_ARCH_CODESHELL,       "codeshell" },
+    { LLM_ARCH_ORION,           "orion"     },
 };

 enum llm_kv {
@@ -641,6 +648,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ORION,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },

     {
         LLM_ARCH_UNKNOWN,
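Note: the "%d" in the tensor-name table above is filled with the layer index, and the tn() helper (not shown in this diff) appends a ".weight"/".bias" suffix, as can be seen in the llm_load_tensors hunk further down. A minimal standalone illustration of the resulting naming scheme; the snprintf expansion here is a sketch, not the actual tn() implementation:

#include <cstdio>

int main() {
    char name[64];
    for (int i = 0; i < 2; ++i) {
        // expand the per-layer format string from the table above
        std::snprintf(name, sizeof(name), "blk.%d.ffn_up", i);
        std::printf("%s.weight\n", name); // -> blk.0.ffn_up.weight, blk.1.ffn_up.weight
    }
    return 0;
}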
@@ -1256,8 +1282,14 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
+#elif defined(GGML_USE_VULKAN)
+    if (host_buffer) {
+        buft = ggml_backend_vk_host_buffer_type();
+    }
 #endif

     if (buft == nullptr) {
@@ -1275,6 +1307,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type();
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #endif
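Note: with Vulkan and SYCL added, backend buffer selection stays a single compile-time #elif chain: exactly one branch survives preprocessing, depending on which GGML_USE_* macro the build defines. A condensed sketch of the resulting priority order; pick_offload_buffer_type is a hypothetical name, and the real llama_default_buffer_type_offload above falls back to the CPU buffer type slightly differently:

static ggml_backend_buffer_type_t pick_offload_buffer_type(int gpu) {
#if defined(GGML_USE_METAL)
    return ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUBLAS)
    return ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
    return ggml_backend_vk_buffer_type();      // note: no per-GPU index yet
#elif defined(GGML_USE_SYCL)
    return ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
    return ggml_backend_opencl_buffer_type();
#else
    (void) gpu;
    return ggml_backend_cpu_buffer_type();     // CPU fallback
#endif
}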
@@ -1332,6 +1368,7 @@ enum e_model {
     MODEL_7B,
     MODEL_8B,
     MODEL_13B,
+    MODEL_14B,
     MODEL_15B,
     MODEL_30B,
     MODEL_34B,
@@ -2683,6 +2720,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_7B:   return "7B";
         case MODEL_8B:   return "8B";
         case MODEL_13B:  return "13B";
+        case MODEL_14B:  return "14B";
         case MODEL_15B:  return "15B";
         case MODEL_30B:  return "30B";
         case MODEL_34B:  return "34B";
@@ -2950,7 +2988,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_ORION:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_14B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -3933,6 +3979,38 @@ static bool llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                 }
             } break;
+        case LLM_ARCH_ORION:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                {
+                    model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                }
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff});
+                }
+            } break;
+
+
         default:
             throw std::runtime_error("unknown architecture");
     }
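Note: wk and wv above are created with n_embd_gqa columns, the grouped-query-attention K/V width (head dimension times n_head_kv). For Orion-14B this degenerates to plain multi-head attention, but the GQA-ready shape is kept. A tiny check under assumed Orion-14B-like dimensions (the concrete numbers here are assumptions, not taken from this diff):

#include <cassert>
#include <cstdint>

int main() {
    const int64_t n_embd      = 5120;              // assumed hidden size
    const int64_t n_head      = 40;                // assumed attention heads
    const int64_t n_head_kv   = 40;                // == n_head -> plain MHA
    const int64_t n_embd_head = n_embd / n_head;
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv; // K/V width used for wk/wv
    assert(n_embd_gqa == n_embd); // degenerates to the MHA shape when n_head_kv == n_head
    return 0;
}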
@@ -4563,6 +4641,126 @@ struct llm_build_context {
             ctx0 = nullptr;
         }
     }
+    struct ggml_cgraph * build_orion() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                // if (model.layers[il].bq) {
+                //     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                //     cb(Qcur, "Qcur", il);
+                // }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                // if (model.layers[il].bk) {
+                //     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                //     cb(Kcur, "Kcur", il);
+                // }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                // if (model.layers[il].bv) {
+                //     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                //     cb(Vcur, "Vcur", il);
+                // }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+

     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
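Note: build_orion() follows the standard pre-norm decoder recipe, with two Orion-specific details: normalization uses LLM_NORM (LayerNorm with a bias term, attn_norm_b/ffn_norm_b) rather than LLaMA's RMSNorm, and the QKV bias additions are left stubbed out in comments. The feed-forward block selected by LLM_FFN_SILU + LLM_FFN_PAR is the usual SwiGLU, down(silu(gate(x)) * up(x)). A plain scalar reference of that FFN for orientation; this is a sketch, not the ggml graph code, assuming row-major, bias-free weights as in Orion:

#include <cmath>
#include <vector>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// W_gate, W_up: n_ff x n_embd; W_down: n_embd x n_ff
std::vector<float> swiglu_ffn(const std::vector<float> & x,
                              const std::vector<std::vector<float>> & W_gate,
                              const std::vector<std::vector<float>> & W_up,
                              const std::vector<std::vector<float>> & W_down) {
    const size_t n_ff = W_gate.size(), n_embd = x.size();
    std::vector<float> h(n_ff);
    for (size_t i = 0; i < n_ff; ++i) {
        float g = 0.0f, u = 0.0f;
        for (size_t j = 0; j < n_embd; ++j) {
            g += W_gate[i][j] * x[j];
            u += W_up[i][j]   * x[j];
        }
        h[i] = silu(g) * u; // parallel gate: LLM_FFN_SILU + LLM_FFN_PAR
    }
    std::vector<float> y(n_embd, 0.0f); // down-projection back to n_embd
    for (size_t j = 0; j < n_embd; ++j) {
        for (size_t i = 0; i < n_ff; ++i) {
            y[j] += W_down[j][i] * h[i];
        }
    }
    return y;
}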
@@ -6520,6 +6718,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_codeshell();
             } break;
+        case LLM_ARCH_ORION:
+            {
+                result = llm.build_orion();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -6652,7 +6854,7 @@ static int llama_decode_internal(
     }

     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) {
         n_threads = 1;
     }

@@ -7946,6 +8148,11 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 }

 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
+    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // if (k >= (int32_t)candidates->size) {
+    //     return;
+    // }
+
     const int64_t t_start_sample_us = ggml_time_us();

     k = std::max(k, (int) min_keep);
@@ -8054,21 +8261,56 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }

-    llama_sample_softmax(ctx, candidates);
-
     const int64_t t_start_sample_us = ggml_time_us();

-    float scale = candidates->data[0].p; // scale by max prob
-    size_t i = 1; // first token always matches
+    bool min_p_applied = false;
+
+    // if the candidates aren't sorted, try the unsorted implementation first
+    if (!candidates->sorted) {
+        std::vector<llama_token_data> filtered_tokens;

-    for (; i < candidates->size; ++i) {
-        if (candidates->data[i].p < p * scale && i >= min_keep) {
-            break; // prob too small
+        float max_logit = -FLT_MAX;
+        for (size_t i = 0; i < candidates->size; ++i) {
+            max_logit = std::max(max_logit, candidates->data[i].logit);
+        }
+        const float min_logit = max_logit + logf(p); // min logit for p_i >= p * p_max
+
+        for (size_t i = 0; i < candidates->size; ++i) {
+            if (candidates->data[i].logit >= min_logit) {
+                filtered_tokens.push_back(candidates->data[i]);
+            }
+        }
+
+        // if we have enough values the operation was a success
+        if (filtered_tokens.size() >= min_keep) {
+            memcpy(candidates->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
+            candidates->size = filtered_tokens.size();
+            min_p_applied = true;
         }
     }

-    // Resize the output vector to keep only the matching tokens
-    candidates->size = i;
+    // if the candidates are sorted or the unsorted implementation failed, use this implementation
+    if (!min_p_applied) {
+        // Sort the logits in descending order
+        if (!candidates->sorted) {
+            std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+                return a.logit > b.logit;
+            });
+            candidates->sorted = true;
+        }
+
+        const float min_logit = candidates->data[0].logit + logf(p); // min logit for p_i >= p * p_max
+        size_t i = 1; // first token always matches
+
+        for (; i < candidates->size; ++i) {
+            if (candidates->data[i].logit < min_logit && i >= min_keep) {
+                break; // prob too small
+            }
+        }
+
+        // Resize the output vector to keep only the matching tokens
+        candidates->size = i;
+    }

     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
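Note: the new unsorted fast path works because the min-p threshold can be applied in logit space: with softmax probabilities p_i = exp(l_i) / Z, the ratio p_i / p_max = exp(l_i - l_max), so p_i >= p * p_max is equivalent to l_i >= l_max + log(p) (the "min logit" comment in the hunk). No softmax and no sort are needed unless fewer than min_keep tokens survive the filter. A small self-contained check of that equivalence; the logit values are made up:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

int main() {
    const std::vector<float> logits = {2.0f, 1.0f, -1.0f, -3.0f}; // example values
    const float p = 0.1f;

    float l_max = logits[0];
    for (float l : logits) l_max = std::max(l_max, l);

    float z = 0.0f; // softmax normalizer, shifted by l_max for stability
    for (float l : logits) z += std::exp(l - l_max);
    const float p_max = 1.0f / z; // probability of the top logit

    for (float l : logits) {
        const bool keep_logit = l >= l_max + std::log(p);               // logit-space test
        const bool keep_prob  = std::exp(l - l_max) / z >= p * p_max;   // probability-space test
        assert(keep_logit == keep_prob);
    }
    return 0;
}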
@@ -9997,6 +10239,26 @@ struct llama_context * llama_new_context_with_model(
             }
         }
     }
+#elif defined(GGML_USE_VULKAN)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_vk_init();
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
+#elif defined(GGML_USE_SYCL)
+    if (model->n_gpu_layers > 0) {
+        ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+        if (backend == nullptr) {
+            LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+            llama_free(ctx);
+            return nullptr;
+        }
+        ctx->backends.push_back(backend);
+    }
 #endif
     ctx->backend_cpu = ggml_backend_cpu_init();
     if (ctx->backend_cpu == nullptr) {
examples/talk-llama/llama.h CHANGED
@@ -6,6 +6,9 @@
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.h"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
@@ -46,7 +49,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4

-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
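Note: the llama.h changes mean downstream code keeps using the same two macros and picks up the new backends automatically: LLAMA_MAX_DEVICES now reflects SYCL builds, and LLAMA_SUPPORTS_GPU_OFFLOAD is also defined for Vulkan and SYCL. For example, hypothetical consumer code:

#include <cstdio>
#include "llama.h"

int main() {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    std::printf("GPU offload supported, LLAMA_MAX_DEVICES = %d\n", (int) LLAMA_MAX_DEVICES);
#else
    std::printf("CPU-only build\n");
#endif
    return 0;
}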