examples : fix build + compile warnings (close #1256)
Files changed:
- examples/common.cpp +1 -1
- examples/lsp/lsp.cpp +3 -3
- examples/main/main.cpp +2 -2
- examples/talk-llama/llama.cpp +12 -13
- examples/talk-llama/talk-llama.cpp +4 -1
- examples/talk/gpt-2.cpp +3 -3
- examples/talk/talk.cpp +4 -1
examples/common.cpp

@@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
     return true;
 }
 
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
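This change (and the matching main.cpp change below) silences -Wunused-parameter by commenting out the names of parameters the function body never reads, while keeping the signature intact. A minimal standalone sketch of the pattern, with a hypothetical function name:

    #include <cstdio>

    // The parameter stays in the signature so the API shape is preserved;
    // commenting out its name marks it as intentionally unused.
    void print_usage(int /*argc*/, char ** argv) {
        fprintf(stderr, "usage: %s [options]\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        print_usage(argc, argv);
        return 0;
    }

Casting to void or C++17 [[maybe_unused]] would also work; the commented-out name is simply the convention already used in this codebase.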
examples/lsp/lsp.cpp

@@ -324,12 +324,12 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
     commandset_list.push_back(cs);
     return json{{"index",index}};
 }
-json seek(struct whisper_context * ctx, audio_async & audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
     // whisper_state has the pertinent offsets, but there also seem to be a large
     // number of scratch buffers that would prevent rewinding context in a manner similar to llama
     // I'll give this a another pass once everything else is implemented,
     // but for now, it's unsupported
-    throw json{
+    throw json {
         {"code", -32601},
         {"message", "Seeking is not yet supported."}
     };

@@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
         jobqueue.pop_front();
         // send response
         std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
-        fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+        fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
         std::cout.flush();
 
     }
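The fprintf change fixes a -Wformat warning: std::string::length() returns size_t, which does not match %d. The fix casts to int; printing with %zu would be the other common option. A small sketch of the same idea (the payload is a placeholder):

    #include <cstdio>
    #include <string>

    int main() {
        std::string data = "{\"ok\":true}";
        // data.length() is size_t; %d expects int, so cast explicitly
        // (or switch the format specifier to %zu and drop the cast).
        fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int) data.length() + 1, data.c_str());
        return 0;
    }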
examples/main/main.cpp

@@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
 
     return speaker;
 }
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
     int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
     int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
     if (progress >= *progress_prev + progress_step) {

@@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
     return true;
 }
 
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
     std::ofstream fout(fname);
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 
examples/talk-llama/llama.cpp

@@ -1164,7 +1164,7 @@ static bool llama_eval_internal(
         const llama_token * tokens,
         const int   n_tokens,
         const int   n_past,
-        const int   n_threads) {
+              int   n_threads) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {

@@ -1190,6 +1190,8 @@ static bool llama_eval_internal(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;
 
+    const float eps = 5e-6f; // TODO: take from hparams
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1204,7 +1206,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");

@@ -1221,7 +1223,7 @@ static bool llama_eval_internal(
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);

@@ -1329,7 +1331,7 @@ static bool llama_eval_internal(
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);

@@ -1367,7 +1369,7 @@ static bool llama_eval_internal(
     // norm
     {
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);
 
         // inpL = inpL*norm(broadcasted)
         inpL = ggml_mul(ctx0, inpL, model.norm);

@@ -1384,8 +1386,8 @@ static bool llama_eval_internal(
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute(ctx0, &gf);
+    ggml_build_forward_expand (&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)

@@ -2488,8 +2490,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);

@@ -2635,7 +2636,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;

@@ -2655,7 +2655,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }

@@ -2743,7 +2743,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;

@@ -2763,7 +2762,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }
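The talk-llama llama.cpp changes track two ggml API updates: ggml_rms_norm() now takes the epsilon explicitly (here hard-coded to 5e-6f with a TODO to read it from hparams), and the thread count is no longer a field on ggml_cgraph but an argument to ggml_graph_compute_with_ctx(). A rough sketch of the new calling pattern, assuming the ggml version bundled with this commit:

    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx0 = ggml_init(params);

        // toy input tensor so the graph has something to normalize
        struct ggml_tensor * x = ggml_set_f32(ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8), 1.0f);

        const float eps = 5e-6f;                          // previously baked into ggml_rms_norm
        struct ggml_tensor * cur = ggml_rms_norm(ctx0, x, eps);

        struct ggml_cgraph gf = {};
        ggml_build_forward_expand(&gf, cur);
        ggml_graph_compute_with_ctx(ctx0, &gf, /*n_threads =*/ 1); // threads passed here, not via gf.n_threads

        ggml_free(ctx0);
        return 0;
    }

Because the thread count is now passed at compute time, n_threads in llama_eval_internal() can no longer be const: the BLAS special case reassigns it before the graph is computed.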
examples/talk-llama/talk-llama.cpp

@@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
                 }
 
                 text_to_speak = ::replace(text_to_speak, "\"", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                if (ret != 0) {
+                    fprintf(stderr, "%s: failed to speak\n", __func__);
+                }
 
                 audio.clear();
 
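This change (and the identical one in talk.cpp below) fixes the -Wunused-result warning on system(), which some libcs declare with the warn_unused_result attribute: the return code is now stored and checked. A small standalone sketch of the pattern, with a placeholder command:

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // Capturing and checking the return value both silences the warning
        // and surfaces a failed speak/TTS command instead of ignoring it.
        int ret = system("echo hello");
        if (ret != 0) {
            fprintf(stderr, "%s: failed to run command\n", __func__);
        }
        return 0;
    }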
examples/talk/gpt-2.cpp

@@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     // create the ggml context
     {
         struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
         };
 
         model.ctx = ggml_init(params);
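Here the C99-style designated initializers, which trigger warnings (or errors, depending on the compiler and standard) when compiled as C++, are replaced by positional initializers with the field names kept as comments, the convention used elsewhere in the ggml examples. A self-contained sketch of the idea; the struct below only mirrors ggml_init_params for illustration:

    #include <cstddef>

    // Stand-in with the same fields as ggml_init_params.
    struct init_params {
        size_t mem_size;
        void * mem_buffer;
        bool   no_alloc;
    };

    int main() {
        const size_t ctx_size = 1024*1024;

        // Positional initialization is valid in every C++ standard; the
        // commented-out field names keep it as readable as designated init.
        struct init_params params = {
            /*.mem_size   =*/ ctx_size,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ false,
        };

        return params.no_alloc ? 1 : 0;
    }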
examples/talk/talk.cpp

@@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
             gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
 
             text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-            system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            if (ret != 0) {
+                fprintf(stderr, "%s: system() failed!\n", __func__);
+            }
 
             audio.clear();
 