talk-llama : sync llama.cpp
examples/talk-llama/llama-sampling.cpp
CHANGED

@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
     // if (k >= (int32_t)cur_p->size) {
    //     return;
    // }

@@ -733,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
     };
 }
 
-// tail-free
-
-struct llama_sampler_tail_free {
-    const float z;
-    const size_t min_keep;
-};
-
-static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
-    return "tail-free";
-}
-
-static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
-
-    if (ctx->z >= 1.0f || cur_p->size <= 2) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(cur_p->size - 1);
-    std::vector<float> second_derivatives(cur_p->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    cur_p->size = last_idx;
-}
-
-static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
-    return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
-}
-
-static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
-    delete (llama_sampler_tail_free *) smpl->ctx;
-}
-
-static struct llama_sampler_i llama_sampler_tail_free_i = {
-    /* .name   = */ llama_sampler_tail_free_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_tail_free_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_tail_free_clone,
-    /* .free   = */ llama_sampler_tail_free_free,
-};
-
-struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
-    return new llama_sampler {
-        /* .iface = */ &llama_sampler_tail_free_i,
-        /* .ctx   = */ new llama_sampler_tail_free {
-            /* .z        = */ z,
-            /*. min_keep = */ min_keep,
-        },
-    };
-}
-
 // typical
 
 struct llama_sampler_typical {
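
Note that the tail-free sampler is removed outright rather than deprecated, so a talk-llama setup that still requested it has to fall back to one of the remaining truncation samplers. Below is a minimal sketch of such a fallback; it assumes the llama_sampler_chain_* helpers and llama_sampler_init_dist from llama.h (they are not part of this diff), and the parameter values are purely illustrative.

#include "llama.h"

// Sketch only: build a sampler chain without the removed tail-free step.
static struct llama_sampler * build_chain_without_tail_free(void) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // previously something like: llama_sampler_chain_add(chain, llama_sampler_init_tail_free(0.95f, 1));
    llama_sampler_chain_add(chain, llama_sampler_init_min_p(0.05f, 1));          // truncation step that remains in the API
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); // final token selection

    return chain;
}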

@@ -1971,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_dry *) smpl->ctx;
 
-
-
+    llama_vocab dummy_vocab;
+
+    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
+    auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
     // Copy the state, including the processed breakers
     {
         auto * result_ctx = (llama_sampler_dry *) result->ctx;
examples/talk-llama/llama.cpp
CHANGED

The diff for this file is too large to render; see the raw diff.
examples/talk-llama/llama.h
CHANGED

@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"
 
 #include <stddef.h>

@@ -205,7 +206,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)

@@ -274,10 +275,7 @@ extern "C" {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
-        //
-        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_MODE_LAYER: ignored
+        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
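
With the per-mode bullet points collapsed into one line, main_gpu is now documented only for LLAMA_SPLIT_MODE_NONE. A sketch of configuring that case; llama_model_default_params comes from llama.h and the values are illustrative, not taken from this commit:

#include "llama.h"

// Sketch: keep the whole model on one device, the only case where main_gpu is consulted.
static struct llama_model_params single_gpu_params(void) {
    struct llama_model_params mparams = llama_model_default_params();

    mparams.n_gpu_layers = 99;                    // offload all layers to the GPU
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // no layer/row splitting across devices
    mparams.main_gpu     = 0;                     // device that holds the entire model

    return mparams;
}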

@@ -799,7 +797,7 @@ extern "C" {
     // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     // Stores the encoder output internally for later use by the decoder cross-attention layers.
     //   0 - success
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch     batch);

@@ -807,7 +805,7 @@ extern "C" {
     // Positive return values does not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_decode(
             struct llama_context * ctx,
             struct llama_batch     batch);
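
The added sentence tightens the error contract for both calls: on any negative return the KV cache is rolled back, so the caller does not have to guess which tokens were committed. A short sketch of reacting to llama_decode's documented return values (caller-side usage, assumed rather than taken from this diff):

#include <cstdio>
#include "llama.h"

// Sketch: handle the three documented outcomes of llama_decode.
static bool decode_checked(struct llama_context * ctx, struct llama_batch batch) {
    const int32_t ret = llama_decode(ctx, batch);
    if (ret == 0) {
        return true;  // success
    }
    if (ret == 1) {
        fprintf(stderr, "no KV slot for this batch - reduce the batch size or grow the context\n");
        return false; // warning, not fatal
    }
    fprintf(stderr, "llama_decode error %d - KV cache restored to its pre-call state\n", ret);
    return false;     // < 0: error
}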

@@ -1087,9 +1085,6 @@ extern "C" {
     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
 