ggerganov committed on
Commit 2cfc05a · unverified · 1 Parent(s): b2abb1b

examples : fix build + compile warnings (close #1256)

examples/common.cpp CHANGED
@@ -792,7 +792,7 @@ bool sam_params_parse(int argc, char ** argv, sam_params & params) {
     return true;
 }
 
-void sam_print_usage(int argc, char ** argv, const sam_params & params) {
+void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
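
The change above silences -Wunused-parameter without altering the function's signature: the unused parameter keeps its type, but its name is commented out. A minimal sketch of the pattern (hypothetical names, not from this commit):

#include <cstdio>

// With -Wall -Wextra, naming an unused parameter triggers -Wunused-parameter:
//   void print_usage(int argc, char ** argv) { ... }   // warning: unused parameter 'argc'

// Commenting out the name keeps the signature (and any function-pointer type) intact:
void print_usage(int /*argc*/, char ** argv) {
    std::fprintf(stderr, "usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    print_usage(argc, argv);
    return 0;
}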
examples/lsp/lsp.cpp CHANGED
@@ -324,12 +324,12 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
     commandset_list.push_back(cs);
     return json{{"index",index}};
 }
-json seek(struct whisper_context * ctx, audio_async &audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
     // whisper_state has the pertinent offsets, but there also seem to be a large
     // number of scratch buffers that would prevent rewinding context in a manner similar to llama
     // I'll give this a another pass once everything else is implemented,
     // but for now, it's unsupported
-    throw json{
+    throw json {
         {"code", -32601},
         {"message", "Seeking is not yet supported."}
     };
@@ -412,7 +412,7 @@ void process_loop(struct whisper_context * ctx, audio_async &audio, const whispe
         jobqueue.pop_front();
         // send response
         std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
-        fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+        fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
         std::cout.flush();
 
     }
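
The fprintf change fixes a -Wformat warning: std::string::length() returns size_t, while %d expects int, so the commit casts the value. A standalone sketch of the two common fixes (cast to int, or use %zu):

#include <cstdio>
#include <string>

int main() {
    std::string data = "{\"ok\":true}";

    // Mismatched: %d expects int, but length() returns size_t (-Wformat warning):
    //   std::printf("Content-Length: %d\r\n\r\n%s\n", data.length() + 1, data.c_str());

    // Fix used in the commit: cast the size_t to int
    std::printf("Content-Length: %d\r\n\r\n%s\n", (int) data.length() + 1, data.c_str());

    // Alternative: use the size_t conversion specifier
    std::printf("Content-Length: %zu\r\n\r\n%s\n", data.length() + 1, data.c_str());

    return 0;
}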
examples/main/main.cpp CHANGED
@@ -260,7 +260,7 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
 
     return speaker;
 }
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
     int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
     int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
     if (progress >= *progress_prev + progress_step) {
@@ -492,7 +492,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
     return true;
 }
 
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
    std::ofstream fout(fname);
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
 
examples/talk-llama/llama.cpp CHANGED
@@ -1164,7 +1164,7 @@ static bool llama_eval_internal(
         const llama_token * tokens,
         const int   n_tokens,
         const int   n_past,
-        const int   n_threads) {
+              int   n_threads) {
 
     // enforce that the first token is BOS
     if (n_past == 0 && tokens[0] != llama_token_bos()) {
@@ -1190,6 +1190,8 @@ static bool llama_eval_internal(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;
 
+    const float eps = 5e-6f; // TODO: take from hparams
+
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
@@ -1204,7 +1206,7 @@
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     ggml_set_name(embd, "embd");
@@ -1221,7 +1223,7 @@
 
         // norm
         {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, eps);
 
             // cur = cur*attention_norm(broadcasted)
             cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
@@ -1329,7 +1331,7 @@
         {
             // norm
             {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, eps);
 
                 // cur = cur*ffn_norm(broadcasted)
                 cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
@@ -1367,7 +1369,7 @@
     // norm
     {
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, eps);
 
         // inpL = inpL*norm(broadcasted)
         inpL = ggml_mul(ctx0, inpL, model.norm);
@@ -1384,8 +1386,8 @@
     //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
+    ggml_build_forward_expand  (&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
 
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
@@ -2488,8 +2490,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
 
         struct ggml_cgraph gf = ggml_build_forward(r);
-        gf.n_threads = n_threads;
-        ggml_graph_compute(lora_ctx, &gf);
+        ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
@@ -2635,7 +2636,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kout3d->data = out;
@@ -2655,7 +2655,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }
@@ -2743,7 +2743,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
         ggml_cgraph gf{};
-        gf.n_threads = 1;
 
         ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
         kin3d->data = (void *) inp;
@@ -2763,7 +2762,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-        ggml_graph_compute(cpy_ctx, &gf);
+        ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
 
         ggml_free(cpy_ctx);
     }
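
These llama.cpp edits track two upstream ggml API changes that broke the build: ggml_cgraph no longer carries an n_threads field (the thread count is passed to ggml_graph_compute_with_ctx at compute time), and ggml_rms_norm now takes an explicit epsilon. A small self-contained sketch against the post-change ggml API (the tensor sizes and values are arbitrary):

#include "ggml.h"
#include <cstdio>

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(a, 2.0f);

    // epsilon is now an explicit argument (the commit uses 5e-6f with a TODO to read it from hparams)
    struct ggml_tensor * out = ggml_rms_norm(ctx, a, 5e-6f);

    struct ggml_cgraph gf = {};
    ggml_build_forward_expand(&gf, out);

    // old API: gf.n_threads = 4; ggml_graph_compute(ctx, &gf);
    // new API: the thread count is passed at compute time
    ggml_graph_compute_with_ctx(ctx, &gf, 4);

    std::printf("out[0] = %f\n", ggml_get_f32_1d(out, 0));

    ggml_free(ctx);
    return 0;
}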
examples/talk-llama/talk-llama.cpp CHANGED
@@ -649,7 +649,10 @@ int main(int argc, char ** argv) {
             }
 
             text_to_speak = ::replace(text_to_speak, "\"", "");
-            system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            if (ret != 0) {
+                fprintf(stderr, "%s: failed to speak\n", __func__);
+            }
 
             audio.clear();
 
examples/talk/gpt-2.cpp CHANGED
@@ -191,9 +191,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     // create the ggml context
     {
         struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
         };
 
         model.ctx = ggml_init(params);
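
The gpt-2.cpp hunk is a build fix rather than a warning fix: designated initializers (.mem_size = ...) are a C99 feature that C++ only gained in C++20, so they fail or warn when this file is compiled as C++. Keeping the field names as comments preserves readability while falling back to positional aggregate initialization, the usual ggml convention. A tiny sketch with a simplified stand-in struct:

#include <cstddef>

// Simplified stand-in with the same three fields as ggml_init_params.
struct init_params {
    size_t mem_size;
    void * mem_buffer;
    bool   no_alloc;
};

int main() {
    // Not valid C++ before C++20 (C99 designated initializers):
    //   struct init_params params = { .mem_size = 1024, .mem_buffer = NULL, .no_alloc = false };

    // Positional initialization, field names kept as comments:
    struct init_params params = {
        /*.mem_size   =*/ 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };

    (void) params;
    return 0;
}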
examples/talk/talk.cpp CHANGED
@@ -349,7 +349,10 @@ int main(int argc, char ** argv) {
             gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
 
             text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-            system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            if (ret != 0) {
+                fprintf(stderr, "%s: system() failed!\n", __func__);
+            }
 
             audio.clear();
 
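
In both talk-llama.cpp and talk.cpp, the bare system() call is replaced by a checked one: on common glibc toolchains system() is declared warn_unused_result, so ignoring the return value trips -Wunused-result, and checking it also gives a basic failure diagnostic. A minimal sketch of the pattern (the speak helper and the echo stand-in command are illustrative, not from the commit):

#include <cstdio>
#include <cstdlib>
#include <string>

// Illustrative helper: run a text-to-speech command and report failure.
static void speak(const std::string & cmd, int voice_id, const std::string & text) {
    int ret = std::system((cmd + " " + std::to_string(voice_id) + " \"" + text + "\"").c_str());
    if (ret != 0) {
        std::fprintf(stderr, "%s: failed to speak\n", __func__);
    }
}

int main() {
    speak("echo", 2, "hello");
    return 0;
}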