ggerganov committed
Commit 6bb34fb
1 Parent(s): dfd316d

talk-llama : sync llama.cpp

examples/talk-llama/llama-sampling.cpp CHANGED
@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
 }
 
 static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
-    // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
+    // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
     // if (k >= (int32_t)cur_p->size) {
     //     return;
     // }
@@ -733,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
     };
 }
 
-// tail-free
-
-struct llama_sampler_tail_free {
-    const float  z;
-    const size_t min_keep;
-};
-
-static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
-    return "tail-free";
-}
-
-static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
-
-    if (ctx->z >= 1.0f || cur_p->size <= 2) {
-        return;
-    }
-
-    llama_sampler_softmax_impl(cur_p);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(cur_p->size - 1);
-    std::vector<float> second_derivatives(cur_p->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    {
-        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-
-        if (second_derivatives_sum > 1e-6f) {
-            for (float & value : second_derivatives) {
-                value /= second_derivatives_sum;
-            }
-        } else {
-            for (float & value : second_derivatives) {
-                value = 1.0f / second_derivatives.size();
-            }
-        }
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = cur_p->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->z && i >= ctx->min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    cur_p->size = last_idx;
-}
-
-static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
-    return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
-}
-
-static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
-    delete (llama_sampler_tail_free *) smpl->ctx;
-}
-
-static struct llama_sampler_i llama_sampler_tail_free_i = {
-    /* .name   = */ llama_sampler_tail_free_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_tail_free_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_tail_free_clone,
-    /* .free   = */ llama_sampler_tail_free_free,
-};
-
-struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
-    return new llama_sampler {
-        /* .iface = */ &llama_sampler_tail_free_i,
-        /* .ctx   = */ new llama_sampler_tail_free {
-            /* .z        = */ z,
-            /* .min_keep = */ min_keep,
-        },
-    };
-}
-
 // typical
 
 struct llama_sampler_typical {
@@ -1971,8 +1876,11 @@ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
 static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (llama_sampler_dry *) smpl->ctx;
 
-    // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
-    auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+    llama_vocab dummy_vocab;
+
+    // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
+    auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
+
     // Copy the state, including the processed breakers
     {
         auto * result_ctx = (llama_sampler_dry *) result->ctx;
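
Note on usage: the hunks above drop the tail-free sampler implementation entirely, so any sampling setup in talk-llama that previously chained tail_free has to be rebuilt from the remaining samplers. Below is a minimal, hedged sketch of such a chain. Only llama_sampler_init_min_p and llama_sampler_init_typical appear in this diff; the chain helpers (llama_sampler_chain_default_params, llama_sampler_chain_init, llama_sampler_chain_add, llama_sampler_free) and the top-k/temp/dist initializers are assumed from the llama.h of this sync, and the parameter values are illustrative only.

#include "llama.h"

// Hedged sketch: a sampler chain with the tail-free stage removed.
// The chain API and the top_k/temp/dist initializers are assumed from
// this version of llama.h; only min_p and typical appear in this diff.
static struct llama_sampler * make_chain_without_tail_free(void) {
    struct llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_top_k  (40));
    llama_sampler_chain_add(chain, llama_sampler_init_typical(0.95f, 1)); // declared in the llama.h diff below
    llama_sampler_chain_add(chain, llama_sampler_init_min_p  (0.05f, 1)); // declared in the llama.h diff below
    llama_sampler_chain_add(chain, llama_sampler_init_temp   (0.80f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist   (LLAMA_DEFAULT_SEED));

    return chain; // release later with llama_sampler_free(chain)
}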
examples/talk-llama/llama.cpp CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk-llama/llama.h CHANGED
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-cpu.h"
 #include "ggml-backend.h"
 
 #include <stddef.h>
@@ -205,7 +206,7 @@ extern "C" {
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
     // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -274,10 +275,7 @@ extern "C" {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_MODE_LAYER: ignored
+        // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
         int32_t main_gpu;
 
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -799,7 +797,7 @@ extern "C" {
     // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
     // Stores the encoder output internally for later use by the decoder cross-attention layers.
     //   0 - success
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
@@ -807,7 +805,7 @@ extern "C" {
     // Positive return values does not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    // < 0 - error
+    // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
            struct llama_batch batch);
@@ -1087,9 +1085,6 @@ extern "C" {
     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
 
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
-
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
     LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
 
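
For context on the main_gpu change above: after this sync the field is documented to matter only with LLAMA_SPLIT_MODE_NONE. Below is a minimal sketch of setting the fields shown in that hunk when loading a model on a single GPU; llama_model_default_params and llama_load_model_from_file are assumed from this version of llama.h (they are not part of the diff), and "model.gguf" is a placeholder path.

#include "llama.h"

// Hedged sketch: single-GPU load, the one case where main_gpu is still documented to apply.
static struct llama_model * load_on_single_gpu(void) {
    struct llama_model_params mparams = llama_model_default_params(); // assumed helper from llama.h

    mparams.n_gpu_layers = 99;                    // number of layers to store in VRAM (field from the hunk above)
    mparams.split_mode   = LLAMA_SPLIT_MODE_NONE; // single GPU
    mparams.main_gpu     = 0;                     // per the updated comment: the GPU used for the entire model

    return llama_load_model_from_file("model.gguf", mparams); // "model.gguf" is a placeholder
}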
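
Likewise, the updated llama_encode/llama_decode comments now promise that the KV cache state is rolled back on error. A minimal sketch of a caller acting on the documented return codes follows; ctx and batch are assumed to have been created elsewhere.

#include "llama.h"

// Hedged sketch: handle the documented llama_decode return values.
// 0 = success, 1 = no KV slot (warning), < 0 = error with the KV cache restored.
static bool decode_batch(struct llama_context * ctx, struct llama_batch batch) {
    const int32_t ret = llama_decode(ctx, batch);

    if (ret == 0) {
        return true; // success
    }
    if (ret == 1) {
        // warning: no KV slot found - retry with a smaller batch or a larger context
        return false;
    }
    // ret < 0: error - per the updated comment, the KV cache state was restored
    // to what it was before this call
    return false;
}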