ggerganov committed
Commit ff21a60 (unverified) · 1 Parent(s): 8528ec5

talk.wasm : refactoring + update README.md

bindings/javascript/whisper.js CHANGED
The diff for this file is too large to render. See raw diff
 
examples/talk.wasm/CMakeLists.txt CHANGED
@@ -6,6 +6,7 @@ set(TARGET libtalk)
 
 add_executable(${TARGET}
     emscripten.cpp
+    gpt-2.cpp
     )
 
 target_link_libraries(${TARGET} PRIVATE
examples/talk.wasm/README.md CHANGED
@@ -16,7 +16,13 @@ This demo leverages 2 modern neural network models to create a high-quality voic
 
 The web page does the processing locally on your machine. The processing of these heavy neural network models in the
 browser is possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for
-extra performance. For more detailed information, checkout the [current repository](https://github.com/ggerganov/whisper.cpp).
+extra performance:
+
+- The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
+- The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
+- Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
+- The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
+- The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)
 
 In order to run the models, the web page first needs to download the model data which is about ~350 MB. The model data
 is then cached in your browser's cache and can be reused in future visits without downloading it again.
@@ -33,11 +39,11 @@ In order to run this demo efficiently, you need to have the following:
 Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
 Also, the prompting strategy can likely be improved to achieve better results.
 
-The demo is quite computationally heavy - it's not usual to run these transformer models in a browser. Typically, they
-run on powerful GPU hardware. So for better experience, you do need to have a powerful computer.
+The demo is quite computationally heavy, so you need a fast CPU. It's not usual to run these transformer models in a
+browser. Typically, they run on powerful GPUs.
 
-Probably in the near future, mobile browsers will start supporting WASM SIMD. This will allow to run the demo on your
-phone or tablet. But for now this functionality is not supported on mobile devices (at least not on iPhone).
+Currently, mobile browsers do not support the Fixed-width SIMD WebAssembly capability, so you cannot run this demo
+on a phone or a tablet. Hopefully, in the near future this will become supported.
 
 ## Todo
 
examples/talk.wasm/emscripten.cpp CHANGED
@@ -1,985 +1,21 @@
1
  #include "ggml.h"
 
2
  #include "whisper.h"
3
 
4
  #include <emscripten.h>
5
  #include <emscripten/bind.h>
6
 
7
  #include <atomic>
8
- #include <cassert>
9
  #include <cmath>
10
- #include <cstdio>
11
- #include <cstring>
12
- #include <fstream>
13
- #include <map>
14
  #include <mutex>
15
  #include <string>
16
  #include <thread>
17
  #include <vector>
18
  #include <regex>
19
- #include <random>
20
-
21
- std::string to_timestamp(int64_t t) {
22
- int64_t sec = t/100;
23
- int64_t msec = t - sec*100;
24
- int64_t min = sec/60;
25
- sec = sec - min*60;
26
-
27
- char buf[32];
28
- snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
29
-
30
- return std::string(buf);
31
- }
32
-
33
- /////////////////////// GPT-2 BEGIN /////////////////////////
34
- // TODO: move to a separate file
35
-
36
- //
37
- // Vocab utils
38
- //
39
-
40
- struct gpt_vocab {
41
- using id = int32_t;
42
- using token = std::string;
43
-
44
- std::map<token, id> token_to_id;
45
- std::map<id, token> id_to_token;
46
- };
47
-
48
- void replace(std::string & str, const std::string & needle, const std::string & replacement) {
49
- size_t pos = 0;
50
- while ((pos = str.find(needle, pos)) != std::string::npos) {
51
- str.replace(pos, needle.length(), replacement);
52
- pos += replacement.length();
53
- }
54
- }
55
-
56
- std::map<std::string, int32_t> json_parse(const std::string & fname) {
57
- std::map<std::string, int32_t> result;
58
-
59
- // read file into string
60
- std::string json;
61
- {
62
- std::ifstream ifs(fname);
63
- if (!ifs) {
64
- fprintf(stderr, "Failed to open %s\n", fname.c_str());
65
- exit(1);
66
- }
67
-
68
- json = std::string((std::istreambuf_iterator<char>(ifs)),
69
- (std::istreambuf_iterator<char>()));
70
- }
71
-
72
- if (json[0] != '{') {
73
- return result;
74
- }
75
-
76
- // parse json
77
- {
78
- bool has_key = false;
79
- bool in_token = false;
80
-
81
- std::string str_key = "";
82
- std::string str_val = "";
83
-
84
- int n = json.size();
85
- for (int i = 1; i < n; ++i) {
86
- if (!in_token) {
87
- if (json[i] == ' ') continue;
88
- if (json[i] == '"') {
89
- in_token = true;
90
- continue;
91
- }
92
- } else {
93
- if (json[i] == '\\' && i+1 < n) {
94
- if (has_key == false) {
95
- str_key += json[i];
96
- } else {
97
- str_val += json[i];
98
- }
99
- ++i;
100
- } else if (json[i] == '"') {
101
- if (has_key == false) {
102
- has_key = true;
103
- ++i;
104
- while (json[i] == ' ') ++i;
105
- ++i; // :
106
- while (json[i] == ' ') ++i;
107
- if (json[i] != '\"') {
108
- while (json[i] != ',' && json[i] != '}') {
109
- str_val += json[i++];
110
- }
111
- has_key = false;
112
- } else {
113
- in_token = true;
114
- continue;
115
- }
116
- } else {
117
- has_key = false;
118
- }
119
-
120
- ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
121
- ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
122
- ::replace(str_key, "\\\"", "\""); // \\\" -> "
123
-
124
- try {
125
- result[str_key] = std::stoi(str_val);
126
- } catch (...) {
127
- //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
128
-
129
- }
130
- str_key = "";
131
- str_val = "";
132
- in_token = false;
133
- continue;
134
- }
135
- if (has_key == false) {
136
- str_key += json[i];
137
- } else {
138
- str_val += json[i];
139
- }
140
- }
141
- }
142
- }
143
-
144
- return result;
145
- }
146
-
147
- std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
148
- std::vector<std::string> words;
149
-
150
- // first split the text into words
151
- {
152
- std::string str = text;
153
- std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
154
-
155
- std::regex re(pat);
156
- std::smatch m;
157
-
158
- while (std::regex_search(str, m, re)) {
159
- for (auto x : m) {
160
- words.push_back(x);
161
- }
162
- str = m.suffix();
163
- }
164
- }
165
-
166
- // find the longest tokens that form the words:
167
- std::vector<gpt_vocab::id> tokens;
168
- for (const auto & word : words) {
169
- if (word.size() == 0) continue;
170
-
171
- int i = 0;
172
- int n = word.size();
173
- while (i < n) {
174
- int j = n;
175
- while (j > i) {
176
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
177
- if (it != vocab.token_to_id.end()) {
178
- tokens.push_back(it->second);
179
- i = j;
180
- break;
181
- }
182
- --j;
183
- }
184
- if (i == n) {
185
- break;
186
- }
187
- if (j == i) {
188
- auto sub = word.substr(i, 1);
189
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
190
- tokens.push_back(vocab.token_to_id.at(sub));
191
- } else {
192
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
193
- }
194
- ++i;
195
- }
196
- }
197
- }
198
-
199
- return tokens;
200
- }
201
-
202
- bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
203
- printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
204
-
205
- vocab.token_to_id = ::json_parse(fname);
206
-
207
- for (const auto & kv : vocab.token_to_id) {
208
- vocab.id_to_token[kv.second] = kv.first;
209
- }
210
-
211
- printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
212
-
213
- // print the vocabulary
214
- //for (auto kv : vocab.token_to_id) {
215
- // printf("'%s' -> %d\n", kv.first.data(), kv.second);
216
- //}
217
-
218
- return true;
219
- }
220
-
221
- gpt_vocab::id gpt_sample_top_k_top_p(
222
- const gpt_vocab & vocab,
223
- const float * logits,
224
- int top_k,
225
- double top_p,
226
- double temp,
227
- std::mt19937 & rng) {
228
- int n_logits = vocab.id_to_token.size();
229
-
230
- std::vector<std::pair<double, gpt_vocab::id>> logits_id;
231
- logits_id.reserve(n_logits);
232
-
233
- for (int i = 0; i < n_logits; i++) {
234
- logits_id.push_back(std::make_pair(logits[i], i));
235
- }
236
-
237
- // find the top K tokens
238
- std::partial_sort(
239
- logits_id.begin(),
240
- logits_id.begin() + top_k, logits_id.end(),
241
- [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
242
- return a.first > b.first;
243
- });
244
-
245
- logits_id.resize(top_k);
246
-
247
- // normalize
248
- {
249
- double sum = 0.0f;
250
- for (int i = 0; i < (int)logits_id.size(); i++) {
251
- sum += logits_id[i].first;
252
- }
253
-
254
- sum = 1.0/sum;
255
- for (int i = 0; i < (int)logits_id.size(); i++) {
256
- logits_id[i].first *= sum;
257
- }
258
- }
259
-
260
- if (top_p < 1.0f) {
261
- {
262
- double cumsum = 0.0f;
263
- for (int i = 0; i < top_k; i++) {
264
- cumsum += logits_id[i].first;
265
- if (cumsum >= top_p) {
266
- logits_id.resize(i+1);
267
- break;
268
- }
269
- }
270
- }
271
-
272
- // normalize again
273
- {
274
- double sum = 0.0f;
275
- for (int i = 0; i < (int)logits_id.size(); i++) {
276
- sum += logits_id[i].first;
277
- }
278
-
279
- sum = 1.0/sum;
280
- for (int i = 0; i < (int)logits_id.size(); i++) {
281
- logits_id[i].first *= sum;
282
- }
283
- }
284
- }
285
-
286
- //printf("\n");
287
- //for (int i = 0; i < (int)logits_id.size(); i++) {
288
- // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
289
- //}
290
- //exit(0);
291
-
292
- // sample from the obtained distribution
293
- std::vector<double> probs;
294
- probs.reserve(logits_id.size());
295
-
296
- for (int i = 0; i < (int) logits_id.size(); i++) {
297
- probs.push_back(logits_id[i].first);
298
- }
299
-
300
- std::discrete_distribution<> dist(probs.begin(), probs.end());
301
- int idx = dist(rng);
302
-
303
- return logits_id[idx].second;
304
- }
305
-
306
- // default hparams (GPT-2 117M)
307
- struct gpt2_hparams {
308
- int32_t n_vocab = 50257;
309
- int32_t n_ctx = 1024;
310
- int32_t n_embd = 768;
311
- int32_t n_head = 12;
312
- int32_t n_layer = 12;
313
- int32_t f16 = 1;
314
- };
315
-
316
- struct gpt2_layer {
317
- // normalization
318
- struct ggml_tensor * ln_1_g;
319
- struct ggml_tensor * ln_1_b;
320
-
321
- struct ggml_tensor * ln_2_g;
322
- struct ggml_tensor * ln_2_b;
323
-
324
- // attention
325
- struct ggml_tensor * c_attn_attn_w;
326
- struct ggml_tensor * c_attn_attn_b;
327
-
328
- struct ggml_tensor * c_attn_proj_w;
329
- struct ggml_tensor * c_attn_proj_b;
330
-
331
- // mlp
332
- struct ggml_tensor * c_mlp_fc_w;
333
- struct ggml_tensor * c_mlp_fc_b;
334
-
335
- struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
336
- struct ggml_tensor * c_mlp_proj_b;
337
- };
338
-
339
- struct gpt2_model {
340
- gpt2_hparams hparams;
341
-
342
- // normalization
343
- struct ggml_tensor * ln_f_g;
344
- struct ggml_tensor * ln_f_b;
345
-
346
- struct ggml_tensor * wte; // position embedding
347
- struct ggml_tensor * wpe; // token embedding
348
-
349
- std::vector<gpt2_layer> layers;
350
-
351
- // key + value memory
352
- struct ggml_tensor * memory_k;
353
- struct ggml_tensor * memory_v;
354
-
355
- //
356
- struct ggml_context * ctx;
357
- std::map<std::string, struct ggml_tensor *> tensors;
358
- };
359
-
360
- // load the model's weights from a file
361
- bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
362
- printf("%s: loading model from '%s'\n", __func__, fname.c_str());
363
-
364
- auto fin = std::ifstream(fname, std::ios::binary);
365
- if (!fin) {
366
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
367
- return false;
368
- }
369
-
370
- // verify magic
371
- {
372
- uint32_t magic;
373
- fin.read((char *) &magic, sizeof(magic));
374
- if (magic != 0x67676d6c) {
375
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
376
- return false;
377
- }
378
- }
379
-
380
- // load hparams
381
- {
382
- auto & hparams = model.hparams;
383
-
384
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
385
- fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
386
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
387
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
388
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
389
- fin.read((char *) &hparams.f16, sizeof(hparams.f16));
390
-
391
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
392
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
393
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
394
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
395
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
396
- printf("%s: f16 = %d\n", __func__, hparams.f16);
397
- }
398
-
399
- // load vocab
400
- {
401
- int32_t n_vocab = 0;
402
- fin.read((char *) &n_vocab, sizeof(n_vocab));
403
-
404
- if (n_vocab != model.hparams.n_vocab) {
405
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
406
- __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
407
- return false;
408
- }
409
-
410
- std::string word;
411
- for (int i = 0; i < n_vocab; i++) {
412
- uint32_t len;
413
- fin.read((char *) &len, sizeof(len));
414
-
415
- word.resize(len);
416
- fin.read((char *) word.data(), len);
417
-
418
- vocab.token_to_id[word] = i;
419
- vocab.id_to_token[i] = word;
420
- }
421
- }
422
-
423
- // for the big tensors, we have the option to store the data in 16-bit floats
424
- // in order to save memory and also to speed up the computation
425
- const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
426
-
427
- auto & ctx = model.ctx;
428
-
429
- size_t ctx_size = 0;
430
-
431
- {
432
- const auto & hparams = model.hparams;
433
-
434
- const int n_embd = hparams.n_embd;
435
- const int n_layer = hparams.n_layer;
436
- const int n_ctx = hparams.n_ctx;
437
- const int n_vocab = hparams.n_vocab;
438
-
439
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
440
- ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
441
-
442
- ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
443
- ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
444
-
445
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
446
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
447
-
448
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
449
- ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
450
-
451
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
452
- ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
453
-
454
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
455
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
456
-
457
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
458
- ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
459
-
460
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
461
- ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
462
-
463
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
464
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
465
-
466
- ctx_size += (6 + 12*n_layer)*256; // object overhead
467
-
468
- printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
469
- }
470
-
471
- // create the ggml context
472
- {
473
- struct ggml_init_params params = {
474
- .mem_size = ctx_size,
475
- .mem_buffer = NULL,
476
- };
477
-
478
- model.ctx = ggml_init(params);
479
- if (!model.ctx) {
480
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
481
- return false;
482
- }
483
- }
484
-
485
- // prepare memory for the weights
486
- {
487
- const auto & hparams = model.hparams;
488
-
489
- const int n_embd = hparams.n_embd;
490
- const int n_layer = hparams.n_layer;
491
- const int n_ctx = hparams.n_ctx;
492
- const int n_vocab = hparams.n_vocab;
493
-
494
- model.layers.resize(n_layer);
495
-
496
- model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
497
- model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
498
-
499
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
500
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
501
-
502
- // map by name
503
- model.tensors["model/ln_f/g"] = model.ln_f_g;
504
- model.tensors["model/ln_f/b"] = model.ln_f_b;
505
-
506
- model.tensors["model/wte"] = model.wte;
507
- model.tensors["model/wpe"] = model.wpe;
508
-
509
- for (int i = 0; i < n_layer; ++i) {
510
- auto & layer = model.layers[i];
511
-
512
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
513
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
514
-
515
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
516
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
517
-
518
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
519
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
520
-
521
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
522
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
523
-
524
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
525
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
526
-
527
- layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
528
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
529
-
530
- // map by name
531
- model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
532
- model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
533
-
534
- model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
535
- model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
536
-
537
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
538
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
539
-
540
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
541
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
542
-
543
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
544
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
545
-
546
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
547
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
548
- }
549
- }
550
-
551
- // key + value memory
552
- {
553
- const auto & hparams = model.hparams;
554
-
555
- const int n_embd = hparams.n_embd;
556
- const int n_layer = hparams.n_layer;
557
- const int n_ctx = hparams.n_ctx;
558
-
559
- const int n_mem = n_layer*n_ctx;
560
- const int n_elements = n_embd*n_mem;
561
-
562
- model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
563
- model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
564
-
565
- const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
566
-
567
- printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
568
- }
569
-
570
- // load weights
571
- {
572
- size_t total_size = 0;
573
-
574
- while (true) {
575
- int32_t n_dims;
576
- int32_t length;
577
- int32_t ftype;
578
-
579
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
580
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
581
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
582
-
583
- if (fin.eof()) {
584
- break;
585
- }
586
-
587
- int32_t nelements = 1;
588
- int32_t ne[2] = { 1, 1 };
589
- for (int i = 0; i < n_dims; ++i) {
590
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
591
- nelements *= ne[i];
592
- }
593
-
594
- std::string name(length, 0);
595
- fin.read(&name[0], length);
596
-
597
- if (model.tensors.find(name.data()) == model.tensors.end()) {
598
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
599
- return false;
600
- }
601
-
602
- auto tensor = model.tensors[name.data()];
603
- if (ggml_nelements(tensor) != nelements) {
604
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
605
- return false;
606
- }
607
-
608
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
609
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
610
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
611
- return false;
612
- }
613
-
614
- const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
615
-
616
- if (nelements*bpe != ggml_nbytes(tensor)) {
617
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
618
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
619
- return false;
620
- }
621
-
622
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
623
-
624
- //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
625
- total_size += ggml_nbytes(tensor);
626
- }
627
-
628
- printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
629
- }
630
-
631
- fin.close();
632
-
633
- return true;
634
- }
635
-
636
- // evaluate the transformer
637
- //
638
- // - model: the model
639
- // - n_threads: number of threads to use
640
- // - n_past: the context size so far
641
- // - embd_inp: the embeddings of the tokens in the context
642
- // - embd_w: the predicted probabilities of the next token
643
- //
644
- bool gpt2_eval(
645
- const gpt2_model & model,
646
- const int n_threads,
647
- const int n_past,
648
- const std::vector<gpt_vocab::id> & embd_inp,
649
- std::vector<float> & embd_w,
650
- size_t & mem_per_token) {
651
- const int N = embd_inp.size();
652
-
653
- const auto & hparams = model.hparams;
654
-
655
- const int n_embd = hparams.n_embd;
656
- const int n_layer = hparams.n_layer;
657
- const int n_ctx = hparams.n_ctx;
658
- const int n_head = hparams.n_head;
659
- const int n_vocab = hparams.n_vocab;
660
-
661
- static size_t buf_size = 512u*1024*1024;
662
- static void * buf = malloc(buf_size);
663
-
664
- if (mem_per_token > 0 && mem_per_token*N > buf_size) {
665
- const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
666
- printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
667
-
668
- // reallocate
669
- buf_size = buf_size_new;
670
- buf = realloc(buf, buf_size);
671
- if (buf == nullptr) {
672
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
673
- return false;
674
- }
675
- }
676
-
677
- struct ggml_init_params params = {
678
- .mem_size = buf_size,
679
- .mem_buffer = buf,
680
- };
681
-
682
- struct ggml_context * ctx0 = ggml_init(params);
683
- struct ggml_cgraph gf = { .n_threads = n_threads };
684
-
685
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
686
- memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
687
-
688
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
689
- for (int i = 0; i < N; ++i) {
690
- ((int32_t *) position->data)[i] = n_past + i;
691
- }
692
-
693
- // wte + wpe
694
- struct ggml_tensor * inpL =
695
- ggml_add(ctx0,
696
- ggml_get_rows(ctx0, model.wte, embd),
697
- ggml_get_rows(ctx0, model.wpe, position));
698
-
699
- for (int il = 0; il < n_layer; ++il) {
700
- struct ggml_tensor * cur;
701
-
702
- // norm
703
- {
704
- // [ 768, N]
705
- cur = ggml_norm(ctx0, inpL);
706
-
707
- // cur = ln_1_g*cur + ln_1_b
708
- // [ 768, N]
709
- cur = ggml_add(ctx0,
710
- ggml_mul(ctx0,
711
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
712
- cur),
713
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
714
- }
715
-
716
- // attn
717
- // [2304, 768] - model.layers[il].c_attn_attn_w
718
- // [2304, 1] - model.layers[il].c_attn_attn_b
719
- // [ 768, N] - cur (in)
720
- // [2304, N] - cur (out)
721
- //
722
- // cur = attn_w*cur + attn_b
723
- // [2304, N]
724
- {
725
- cur = ggml_mul_mat(ctx0,
726
- ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
727
- cur);
728
-
729
- cur = ggml_add(ctx0,
730
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
731
- cur);
732
- }
733
-
734
- // self-attention
735
- {
736
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
737
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
738
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
739
-
740
- // store key and value to memory
741
- if (N >= 1) {
742
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
743
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
744
-
745
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
746
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
747
- }
748
-
749
- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
750
- // [64, N, 12]
751
- struct ggml_tensor * Q =
752
- ggml_permute(ctx0,
753
- ggml_cpy(ctx0,
754
- Qcur,
755
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
756
- 0, 2, 1, 3);
757
-
758
- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
759
- // [64, n_past + N, 12]
760
- struct ggml_tensor * K =
761
- ggml_permute(ctx0,
762
- ggml_reshape_3d(ctx0,
763
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
764
- n_embd/n_head, n_head, n_past + N),
765
- 0, 2, 1, 3);
766
-
767
- // GG: flash attention
768
- //struct ggml_tensor * V =
769
- // ggml_cpy(ctx0,
770
- // ggml_permute(ctx0,
771
- // ggml_reshape_3d(ctx0,
772
- // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
773
- // n_embd/n_head, n_head, n_past + N),
774
- // 1, 2, 0, 3),
775
- // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
776
-
777
- //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
778
-
779
- // K * Q
780
- // [n_past + N, N, 12]
781
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
782
-
783
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
784
- // [n_past + N, N, 12]
785
- struct ggml_tensor * KQ_scaled =
786
- ggml_scale(ctx0,
787
- KQ,
788
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
789
- );
790
-
791
- // KQ_masked = mask_past(KQ_scaled)
792
- // [n_past + N, N, 12]
793
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
794
-
795
- // KQ = soft_max(KQ_masked)
796
- // [n_past + N, N, 12]
797
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
798
-
799
- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
800
- // [n_past + N, 64, 12]
801
- struct ggml_tensor * V_trans =
802
- ggml_permute(ctx0,
803
- ggml_reshape_3d(ctx0,
804
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
805
- n_embd/n_head, n_head, n_past + N),
806
- 1, 2, 0, 3);
807
-
808
- // KQV = transpose(V) * KQ_soft_max
809
- // [64, N, 12]
810
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
811
-
812
- // KQV_merged = KQV.permute(0, 2, 1, 3)
813
- // [64, 12, N]
814
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
815
-
816
- // cur = KQV_merged.contiguous().view(n_embd, N)
817
- // [768, N]
818
- cur = ggml_cpy(ctx0,
819
- KQV_merged,
820
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
821
- }
822
-
823
- // projection
824
- // [ 768, 768] - model.layers[il].c_attn_proj_w
825
- // [ 768, 1] - model.layers[il].c_attn_proj_b
826
- // [ 768, N] - cur (in)
827
- // [ 768, N] - cur (out)
828
- //
829
- // cur = proj_w*cur + proj_b
830
- // [768, N]
831
- {
832
- cur = ggml_mul_mat(ctx0,
833
- ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
834
- cur);
835
-
836
- cur = ggml_add(ctx0,
837
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
838
- cur);
839
- }
840
-
841
- // add the input
842
- cur = ggml_add(ctx0, cur, inpL);
843
-
844
- struct ggml_tensor * inpFF = cur;
845
-
846
- // feed-forward network
847
- {
848
- // norm
849
- {
850
- cur = ggml_norm(ctx0, inpFF);
851
-
852
- // cur = ln_2_g*cur + ln_2_b
853
- // [ 768, N]
854
- cur = ggml_add(ctx0,
855
- ggml_mul(ctx0,
856
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
857
- cur),
858
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
859
- }
860
-
861
- // fully connected
862
- // [3072, 768] - model.layers[il].c_mlp_fc_w
863
- // [3072, 1] - model.layers[il].c_mlp_fc_b
864
- // [ 768, N] - cur (in)
865
- // [3072, N] - cur (out)
866
- //
867
- // cur = fc_w*cur + fc_b
868
- // [3072, N]
869
- cur = ggml_mul_mat(ctx0,
870
- ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
871
- cur);
872
-
873
- cur = ggml_add(ctx0,
874
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
875
- cur);
876
-
877
- // GELU activation
878
- // [3072, N]
879
- cur = ggml_gelu(ctx0, cur);
880
-
881
- // projection
882
- // [ 768, 3072] - model.layers[il].c_mlp_proj_w
883
- // [ 768, 1] - model.layers[il].c_mlp_proj_b
884
- // [3072, N] - cur (in)
885
- // [ 768, N] - cur (out)
886
- //
887
- // cur = proj_w*cur + proj_b
888
- // [768, N]
889
- cur = ggml_mul_mat(ctx0,
890
- model.layers[il].c_mlp_proj_w_trans,
891
- cur);
892
-
893
- cur = ggml_add(ctx0,
894
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
895
- cur);
896
- }
897
-
898
- // input for next layer
899
- inpL = ggml_add(ctx0, cur, inpFF);
900
- }
901
-
902
- // norm
903
- {
904
- // [ 768, N]
905
- inpL = ggml_norm(ctx0, inpL);
906
-
907
- // inpL = ln_f_g*inpL + ln_f_b
908
- // [ 768, N]
909
- inpL = ggml_add(ctx0,
910
- ggml_mul(ctx0,
911
- ggml_repeat(ctx0, model.ln_f_g, inpL),
912
- inpL),
913
- ggml_repeat(ctx0, model.ln_f_b, inpL));
914
- }
915
-
916
- // inpL = WTE * inpL
917
- // [ 768, 50257] - model.wte
918
- // [ 768, N] - inpL
919
- inpL = ggml_mul_mat(ctx0, model.wte, inpL);
920
-
921
- // logits -> probs
922
- inpL = ggml_soft_max(ctx0, inpL);
923
-
924
- // run the computation
925
- ggml_build_forward_expand(&gf, inpL);
926
- ggml_graph_compute (ctx0, &gf);
927
-
928
- //if (n_past%100 == 0) {
929
- // ggml_graph_print (&gf);
930
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
931
- //}
932
-
933
- //embd_w.resize(n_vocab*N);
934
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
935
-
936
- // return result for just the last token
937
- embd_w.resize(n_vocab);
938
- memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
939
-
940
- if (mem_per_token == 0) {
941
- mem_per_token = ggml_used_mem(ctx0)/N;
942
- }
943
- //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
944
-
945
- ggml_free(ctx0);
946
-
947
- return true;
948
- }
949
-
950
- /////////////////////////////// GPT-2 END ////////////////////////////////
951
 
952
  constexpr int N_THREAD = 8;
953
 
954
- struct gpt2_state {
955
- std::string prompt_base = R"(Hello, how are you?
956
- I'm fine, thanks. How are you?
957
- Thanks, I'm fine too. What are you doing?
958
- I'm just sitting here.
959
- It's a lovely day, isn't it?
960
- Yes, it is.
961
- Did you know that I'm a robot?
962
- I wasn't aware of that.
963
- )";
964
-
965
- std::mt19937 rng;
966
-
967
- gpt_vocab vocab;
968
- gpt2_model model;
969
-
970
- int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
971
- int32_t n_predict = 32; // new tokens to predict
972
-
973
- // sampling parameters
974
- int32_t top_k = 40;
975
- float top_p = 0.9f;
976
- float temp = 1.0f;
977
- };
978
-
979
- struct gpt2_state g_gpt2;
980
-
981
- std::vector<float> g_pcmf32;
982
-
983
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
984
 
985
  std::mutex g_mutex;
@@ -991,60 +27,18 @@ std::string g_text_to_speak = "";
991
  std::string g_status = "";
992
  std::string g_status_forced = "";
993
 
994
- std::string gpt2_gen_text(const std::string & prompt) {
995
- int n_past = 0;
996
-
997
- std::vector<float> embd_w;
998
-
999
- // tokenize the prompt
1000
- std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(g_gpt2.vocab, prompt);
1001
-
1002
- g_gpt2.n_predict = std::min(g_gpt2.n_predict, g_gpt2.model.hparams.n_ctx - (int) embd_inp.size());
1003
-
1004
- std::vector<gpt_vocab::id> embd = embd_inp;
1005
-
1006
- size_t mem_per_token = 3000000;
1007
-
1008
- std::string result;
1009
-
1010
- for (int i = embd.size(); i < embd_inp.size() + g_gpt2.n_predict; i++) {
1011
- // predict
1012
- if (embd.size() > 0) {
1013
- if (!gpt2_eval(g_gpt2.model, g_gpt2.n_threads, n_past, embd, embd_w, mem_per_token)) {
1014
- printf("gpt-2: failed to generate text\n");
1015
- return "";
1016
- }
1017
- }
1018
-
1019
- n_past += embd.size();
1020
- embd.clear();
1021
-
1022
- {
1023
- // sample next token
1024
- const int top_k = g_gpt2.top_k;
1025
- const float top_p = g_gpt2.top_p;
1026
- const float temp = g_gpt2.temp;
1027
-
1028
- const int n_vocab = g_gpt2.model.hparams.n_vocab;
1029
-
1030
- const gpt_vocab::id id = gpt_sample_top_k_top_p(g_gpt2.vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, g_gpt2.rng);
1031
-
1032
- // add it to the context
1033
- embd.push_back(id);
1034
- }
1035
 
1036
- result += g_gpt2.vocab.id_to_token[embd[0]];
1037
 
1038
- // end of text token
1039
- if (embd.back() == 50256 ||
1040
- g_gpt2.vocab.id_to_token[embd.back()] == "." ||
1041
- g_gpt2.vocab.id_to_token[embd.back()] == "!" ||
1042
- g_gpt2.vocab.id_to_token[embd.back()] == "?") {
1043
- break;
1044
- }
1045
- }
1046
 
1047
- return result;
1048
  }
1049
 
1050
  void talk_set_status(const std::string & status) {
@@ -1072,26 +66,13 @@ void talk_main(size_t index) {
1072
 
1073
  wparams.language = "en";
1074
 
1075
- g_gpt2.rng = std::mt19937(time(NULL));
1076
-
1077
- // load the model
1078
- {
1079
- const int64_t t_start_us = ggml_time_us();
1080
-
1081
- if (!gpt2_model_load("gpt-2.bin", g_gpt2.model, g_gpt2.vocab)) {
1082
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
1083
- return;
1084
- }
1085
-
1086
- const int64_t t_load_us = ggml_time_us() - t_start_us;
1087
-
1088
- printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
1089
- }
1090
 
1091
  printf("talk: using %d threads\n", N_THREAD);
1092
 
1093
  std::vector<float> pcmf32;
1094
 
 
1095
  auto & ctx = g_contexts[index];
1096
 
1097
  const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
@@ -1211,7 +192,7 @@ void talk_main(size_t index) {
1211
 
1212
  talk_set_status("'" + text_heard + "' - thinking how to respond ...");
1213
 
1214
- const std::vector<gpt_vocab::id> tokens = ::gpt_tokenize(g_gpt2.vocab, text_heard);
1215
 
1216
  printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
1217
 
@@ -1220,11 +201,11 @@ void talk_main(size_t index) {
1220
 
1221
  {
1222
  std::lock_guard<std::mutex> lock(g_mutex);
1223
- prompt_base = g_gpt2.prompt_base;
1224
  }
1225
 
1226
  if (tokens.size() > 0) {
1227
- text_to_speak = gpt2_gen_text(prompt_base + text_heard + "\n");
1228
  text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
1229
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
1230
 
@@ -1245,7 +226,7 @@ void talk_main(size_t index) {
1245
  }
1246
  prompt_base += text_heard + "\n" + text_to_speak + "\n";
1247
  } else {
1248
- text_to_speak = gpt2_gen_text(prompt_base);
1249
  text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
1250
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
1251
 
@@ -1269,13 +250,15 @@ void talk_main(size_t index) {
1269
  t_last = std::chrono::high_resolution_clock::now();
1270
  g_text_to_speak = text_to_speak;
1271
  g_pcmf32.clear();
1272
- g_gpt2.prompt_base = prompt_base;
1273
  }
1274
 
1275
  talk_set_status("speaking ...");
1276
  }
1277
  }
1278
 
 
 
1279
  if (index < g_contexts.size()) {
1280
  whisper_free(g_contexts[index]);
1281
  g_contexts[index] = nullptr;
@@ -1351,7 +334,7 @@ EMSCRIPTEN_BINDINGS(talk) {
1351
 
1352
  {
1353
  std::lock_guard<std::mutex> lock(g_mutex);
1354
- text_context = g_gpt2.prompt_base;
1355
  }
1356
 
1357
  return text_context;
@@ -1389,7 +372,7 @@ EMSCRIPTEN_BINDINGS(talk) {
1389
  emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
1390
  {
1391
  std::lock_guard<std::mutex> lock(g_mutex);
1392
- g_gpt2.prompt_base = prompt;
1393
  }
1394
  }));
1395
  }
 
1
  #include "ggml.h"
2
+ #include "gpt-2.h"
3
  #include "whisper.h"
4
 
5
  #include <emscripten.h>
6
  #include <emscripten/bind.h>
7
 
8
  #include <atomic>
 
9
  #include <cmath>
10
  #include <mutex>
11
  #include <string>
12
  #include <thread>
13
  #include <vector>
14
  #include <regex>
15
 
16
  constexpr int N_THREAD = 8;
17
 
18
+ struct gpt2_context * g_gpt2;
19
  std::vector<struct whisper_context *> g_contexts(4, nullptr);
20
 
21
  std::mutex g_mutex;
 
27
  std::string g_status = "";
28
  std::string g_status_forced = "";
29
 
30
+ std::vector<float> g_pcmf32;
31
 
32
+ std::string to_timestamp(int64_t t) {
33
+ int64_t sec = t/100;
34
+ int64_t msec = t - sec*100;
35
+ int64_t min = sec/60;
36
+ sec = sec - min*60;
37
 
38
+ char buf[32];
39
+ snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
40
 
41
+ return std::string(buf);
42
  }
43
 
44
  void talk_set_status(const std::string & status) {
 
66
 
67
  wparams.language = "en";
68
 
69
+ g_gpt2 = gpt2_init("gpt-2.bin");
70
 
71
  printf("talk: using %d threads\n", N_THREAD);
72
 
73
  std::vector<float> pcmf32;
74
 
75
+ // whisper context
76
  auto & ctx = g_contexts[index];
77
 
78
  const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
 
192
 
193
  talk_set_status("'" + text_heard + "' - thinking how to respond ...");
194
 
195
+ const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
196
 
197
  printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
198
 
 
201
 
202
  {
203
  std::lock_guard<std::mutex> lock(g_mutex);
204
+ prompt_base = gpt2_get_prompt(g_gpt2);
205
  }
206
 
207
  if (tokens.size() > 0) {
208
+ text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
209
  text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
210
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
211
 
 
226
  }
227
  prompt_base += text_heard + "\n" + text_to_speak + "\n";
228
  } else {
229
+ text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
230
  text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
231
  text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
232
 
 
250
  t_last = std::chrono::high_resolution_clock::now();
251
  g_text_to_speak = text_to_speak;
252
  g_pcmf32.clear();
253
+ gpt2_set_prompt(g_gpt2, prompt_base.c_str());
254
  }
255
 
256
  talk_set_status("speaking ...");
257
  }
258
  }
259
 
260
+ gpt2_free(g_gpt2);
261
+
262
  if (index < g_contexts.size()) {
263
  whisper_free(g_contexts[index]);
264
  g_contexts[index] = nullptr;
 
334
 
335
  {
336
  std::lock_guard<std::mutex> lock(g_mutex);
337
+ text_context = gpt2_get_prompt(g_gpt2);
338
  }
339
 
340
  return text_context;
 
372
  emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
373
  {
374
  std::lock_guard<std::mutex> lock(g_mutex);
375
+ gpt2_set_prompt(g_gpt2, prompt.c_str());
376
  }
377
  }));
378
  }
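
The refactoring above moves all of the GPT-2 code out of emscripten.cpp and hides it behind a small interface that emscripten.cpp now consumes through gpt-2.h. The header itself is not shown in this diff, so the following is only a sketch of its likely shape, reconstructed from the call sites visible in the new emscripten.cpp (gpt2_init, gpt2_free, gpt2_get_prompt, gpt2_set_prompt, gpt2_tokenize, gpt2_gen_text); return types and the placement of gpt_vocab are assumptions, and the exact declarations live in examples/talk.wasm/gpt-2.h.

```cpp
// Sketch of the interface assumed for examples/talk.wasm/gpt-2.h.
// Reconstructed from the calls made in the new emscripten.cpp; the actual
// header may differ in return types and qualifiers.
#pragma once

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// BPE vocabulary, as used by gpt_tokenize() in gpt-2.cpp
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};

// opaque handle owning the model weights, vocab, prompt and sampling state
struct gpt2_context;

// load the ggml model + vocab from disk (e.g. "gpt-2.bin") / release it
struct gpt2_context * gpt2_init(const char * path_model);
void gpt2_free(struct gpt2_context * ctx);

// the running conversation prompt kept inside the context
std::string gpt2_get_prompt(struct gpt2_context * ctx);
void        gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);

// tokenize text with the GPT-2 vocabulary
std::vector<gpt_vocab::id> gpt2_tokenize(struct gpt2_context * ctx, const char * text);

// generate up to max_tokens new tokens continuing the given prompt
std::string gpt2_gen_text(struct gpt2_context * ctx, const char * prompt, int max_tokens);
```

With the model behind this handle, emscripten.cpp keeps only the audio loop, the Whisper calls and the JS bindings, which is what the hunks above reduce it to.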
examples/talk.wasm/gpt-2.cpp ADDED
@@ -0,0 +1,925 @@
1
+ #include "ggml.h"
2
+ #include "gpt-2.h"
3
+
4
+ #include <cmath>
5
+ #include <cstdio>
6
+ #include <cstring>
7
+ #include <fstream>
8
+ #include <map>
9
+ #include <string>
10
+ #include <thread>
11
+ #include <vector>
12
+ #include <regex>
13
+ #include <random>
14
+
15
+ /////////////////////// GPT-2 BEGIN /////////////////////////
16
+
17
+ //
18
+ // Vocab utils
19
+ //
20
+
21
+ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
22
+ std::vector<std::string> words;
23
+
24
+ // first split the text into words
25
+ {
26
+ std::string str = text;
27
+ std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
28
+
29
+ std::regex re(pat);
30
+ std::smatch m;
31
+
32
+ while (std::regex_search(str, m, re)) {
33
+ for (auto x : m) {
34
+ words.push_back(x);
35
+ }
36
+ str = m.suffix();
37
+ }
38
+ }
39
+
40
+ // find the longest tokens that form the words:
41
+ std::vector<gpt_vocab::id> tokens;
42
+ for (const auto & word : words) {
43
+ if (word.size() == 0) continue;
44
+
45
+ int i = 0;
46
+ int n = word.size();
47
+ while (i < n) {
48
+ int j = n;
49
+ while (j > i) {
50
+ auto it = vocab.token_to_id.find(word.substr(i, j-i));
51
+ if (it != vocab.token_to_id.end()) {
52
+ tokens.push_back(it->second);
53
+ i = j;
54
+ break;
55
+ }
56
+ --j;
57
+ }
58
+ if (i == n) {
59
+ break;
60
+ }
61
+ if (j == i) {
62
+ auto sub = word.substr(i, 1);
63
+ if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
64
+ tokens.push_back(vocab.token_to_id.at(sub));
65
+ } else {
66
+ fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
67
+ }
68
+ ++i;
69
+ }
70
+ }
71
+ }
72
+
73
+ return tokens;
74
+ }
75
+
76
+ gpt_vocab::id gpt_sample_top_k_top_p(
77
+ const gpt_vocab & vocab,
78
+ const float * logits,
79
+ int top_k,
80
+ double top_p,
81
+ double temp,
82
+ std::mt19937 & rng) {
83
+ int n_logits = vocab.id_to_token.size();
84
+
85
+ std::vector<std::pair<double, gpt_vocab::id>> logits_id;
86
+ logits_id.reserve(n_logits);
87
+
88
+ for (int i = 0; i < n_logits; i++) {
89
+ logits_id.push_back(std::make_pair(logits[i], i));
90
+ }
91
+
92
+ // find the top K tokens
93
+ std::partial_sort(
94
+ logits_id.begin(),
95
+ logits_id.begin() + top_k, logits_id.end(),
96
+ [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
97
+ return a.first > b.first;
98
+ });
99
+
100
+ logits_id.resize(top_k);
101
+
102
+ // normalize
103
+ {
104
+ double sum = 0.0f;
105
+ for (int i = 0; i < (int)logits_id.size(); i++) {
106
+ sum += logits_id[i].first;
107
+ }
108
+
109
+ sum = 1.0/sum;
110
+ for (int i = 0; i < (int)logits_id.size(); i++) {
111
+ logits_id[i].first *= sum;
112
+ }
113
+ }
114
+
115
+ if (top_p < 1.0f) {
116
+ {
117
+ double cumsum = 0.0f;
118
+ for (int i = 0; i < top_k; i++) {
119
+ cumsum += logits_id[i].first;
120
+ if (cumsum >= top_p) {
121
+ logits_id.resize(i+1);
122
+ break;
123
+ }
124
+ }
125
+ }
126
+
127
+ // normalize again
128
+ {
129
+ double sum = 0.0f;
130
+ for (int i = 0; i < (int)logits_id.size(); i++) {
131
+ sum += logits_id[i].first;
132
+ }
133
+
134
+ sum = 1.0/sum;
135
+ for (int i = 0; i < (int)logits_id.size(); i++) {
136
+ logits_id[i].first *= sum;
137
+ }
138
+ }
139
+ }
140
+
141
+ //printf("\n");
142
+ //for (int i = 0; i < (int)logits_id.size(); i++) {
143
+ // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
144
+ //}
145
+ //exit(0);
146
+
147
+ // sample from the obtained distribution
148
+ std::vector<double> probs;
149
+ probs.reserve(logits_id.size());
150
+
151
+ for (int i = 0; i < (int) logits_id.size(); i++) {
152
+ probs.push_back(logits_id[i].first);
153
+ }
154
+
155
+ std::discrete_distribution<> dist(probs.begin(), probs.end());
156
+ int idx = dist(rng);
157
+
158
+ return logits_id[idx].second;
159
+ }
160
+
161
+ // default hparams (GPT-2 117M)
162
+ struct gpt2_hparams {
163
+ int32_t n_vocab = 50257;
164
+ int32_t n_ctx = 1024;
165
+ int32_t n_embd = 768;
166
+ int32_t n_head = 12;
167
+ int32_t n_layer = 12;
168
+ int32_t f16 = 1;
169
+ };
170
+
171
+ struct gpt2_layer {
172
+ // normalization
173
+ struct ggml_tensor * ln_1_g;
174
+ struct ggml_tensor * ln_1_b;
175
+
176
+ struct ggml_tensor * ln_2_g;
177
+ struct ggml_tensor * ln_2_b;
178
+
179
+ // attention
180
+ struct ggml_tensor * c_attn_attn_w;
181
+ struct ggml_tensor * c_attn_attn_b;
182
+
183
+ struct ggml_tensor * c_attn_proj_w;
184
+ struct ggml_tensor * c_attn_proj_b;
185
+
186
+ // mlp
187
+ struct ggml_tensor * c_mlp_fc_w;
188
+ struct ggml_tensor * c_mlp_fc_b;
189
+
190
+ struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
191
+ struct ggml_tensor * c_mlp_proj_b;
192
+ };
193
+
194
+ struct gpt2_model {
195
+ gpt2_hparams hparams;
196
+
197
+ // normalization
198
+ struct ggml_tensor * ln_f_g;
199
+ struct ggml_tensor * ln_f_b;
200
+
201
+ struct ggml_tensor * wte; // position embedding
202
+ struct ggml_tensor * wpe; // token embedding
203
+
204
+ std::vector<gpt2_layer> layers;
205
+
206
+ // key + value memory
207
+ struct ggml_tensor * memory_k;
208
+ struct ggml_tensor * memory_v;
209
+
210
+ //
211
+ struct ggml_context * ctx;
212
+ std::map<std::string, struct ggml_tensor *> tensors;
213
+ };
214
+
215
+ // load the model's weights from a file
216
+ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
217
+ printf("%s: loading model from '%s'\n", __func__, fname.c_str());
218
+
219
+ auto fin = std::ifstream(fname, std::ios::binary);
220
+ if (!fin) {
221
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
222
+ return false;
223
+ }
224
+
225
+ // verify magic
226
+ {
227
+ uint32_t magic;
228
+ fin.read((char *) &magic, sizeof(magic));
229
+ if (magic != 0x67676d6c) {
230
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
231
+ return false;
232
+ }
233
+ }
234
+
235
+ // load hparams
236
+ {
237
+ auto & hparams = model.hparams;
238
+
239
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
240
+ fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
241
+ fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
242
+ fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
243
+ fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
244
+ fin.read((char *) &hparams.f16, sizeof(hparams.f16));
245
+
246
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
247
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
248
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
249
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
250
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
251
+ printf("%s: f16 = %d\n", __func__, hparams.f16);
252
+ }
253
+
254
+ // load vocab
255
+ {
256
+ int32_t n_vocab = 0;
257
+ fin.read((char *) &n_vocab, sizeof(n_vocab));
258
+
259
+ if (n_vocab != model.hparams.n_vocab) {
260
+ fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
261
+ __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
262
+ return false;
263
+ }
264
+
265
+ std::string word;
266
+ for (int i = 0; i < n_vocab; i++) {
267
+ uint32_t len;
268
+ fin.read((char *) &len, sizeof(len));
269
+
270
+ word.resize(len);
271
+ fin.read((char *) word.data(), len);
272
+
273
+ vocab.token_to_id[word] = i;
274
+ vocab.id_to_token[i] = word;
275
+ }
276
+ }
277
+
278
+ // for the big tensors, we have the option to store the data in 16-bit floats
279
+ // in order to save memory and also to speed up the computation
280
+ const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
281
+
282
+ auto & ctx = model.ctx;
283
+
284
+ size_t ctx_size = 0;
285
+
286
+ {
287
+ const auto & hparams = model.hparams;
288
+
289
+ const int n_embd = hparams.n_embd;
290
+ const int n_layer = hparams.n_layer;
291
+ const int n_ctx = hparams.n_ctx;
292
+ const int n_vocab = hparams.n_vocab;
293
+
294
+ ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
295
+ ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
296
+
297
+ ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
298
+ ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
299
+
300
+ ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
301
+ ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
302
+
303
+ ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
304
+ ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
305
+
306
+ ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
307
+ ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
308
+
309
+ ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
310
+ ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
311
+
312
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
313
+ ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
314
+
315
+ ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
316
+ ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
317
+
318
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
319
+ ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
320
+
321
+ ctx_size += (6 + 12*n_layer)*256; // object overhead
322
+
323
+ printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
324
+ }
325
+
326
+ // create the ggml context
327
+ {
328
+ struct ggml_init_params params = {
329
+ .mem_size = ctx_size,
330
+ .mem_buffer = NULL,
331
+ };
332
+
333
+ model.ctx = ggml_init(params);
334
+ if (!model.ctx) {
335
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
336
+ return false;
337
+ }
338
+ }
339
+
340
+ // prepare memory for the weights
341
+ {
342
+ const auto & hparams = model.hparams;
343
+
344
+ const int n_embd = hparams.n_embd;
345
+ const int n_layer = hparams.n_layer;
346
+ const int n_ctx = hparams.n_ctx;
347
+ const int n_vocab = hparams.n_vocab;
348
+
349
+ model.layers.resize(n_layer);
350
+
351
+ model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
352
+ model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
353
+
354
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
355
+ model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
356
+
357
+ // map by name
358
+ model.tensors["model/ln_f/g"] = model.ln_f_g;
359
+ model.tensors["model/ln_f/b"] = model.ln_f_b;
360
+
361
+ model.tensors["model/wte"] = model.wte;
362
+ model.tensors["model/wpe"] = model.wpe;
363
+
364
+ for (int i = 0; i < n_layer; ++i) {
365
+ auto & layer = model.layers[i];
366
+
367
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
368
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
369
+
370
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
371
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
372
+
373
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
374
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
375
+
376
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
377
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
378
+
379
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
380
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
381
+
382
+ layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
383
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
384
+
385
+ // map by name
386
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
387
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
388
+
389
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
390
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
391
+
392
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
393
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
394
+
395
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
396
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
397
+
398
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
399
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
400
+
401
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
402
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
403
+ }
404
+ }
405
+
406
+ // key + value memory
407
+ {
408
+ const auto & hparams = model.hparams;
409
+
410
+ const int n_embd = hparams.n_embd;
411
+ const int n_layer = hparams.n_layer;
412
+ const int n_ctx = hparams.n_ctx;
413
+
414
+ const int n_mem = n_layer*n_ctx;
415
+ const int n_elements = n_embd*n_mem;
416
+
417
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
418
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
419
+
420
+ const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
421
+
422
+ printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
423
+ }
424
+
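`memory_k` and `memory_v` form the self-attention cache: each is a flat F32 tensor with `n_layer*n_ctx` slots of `n_embd` values. In `gpt2_eval` below, layer `il` writes the keys/values of the current batch at element offset `(il*n_ctx + n_past)*n_embd` and reads back the first `n_past + N` entries of its slice, so earlier tokens are never recomputed. A small sketch of the indexing (hypothetical helper, not part of the code):

```cpp
// Hypothetical helper showing how the flat KV cache is addressed.
// layer il, token position pos (0 <= pos < n_ctx):
static inline size_t kv_offset(int il, int pos, int n_ctx, int n_embd) {
    return ((size_t) il*n_ctx + pos)*n_embd; // element offset into memory_k / memory_v
}
```

The `ggml_view_1d` calls below multiply this element offset by `ggml_element_size(model.memory_k)` to turn it into a byte offset.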
425
+ // load weights
426
+ {
427
+ size_t total_size = 0;
428
+
429
+ while (true) {
430
+ int32_t n_dims;
431
+ int32_t length;
432
+ int32_t ftype;
433
+
434
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
435
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
436
+ fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
437
+
438
+ if (fin.eof()) {
439
+ break;
440
+ }
441
+
442
+ int32_t nelements = 1;
443
+ int32_t ne[2] = { 1, 1 };
444
+ for (int i = 0; i < n_dims; ++i) {
445
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
446
+ nelements *= ne[i];
447
+ }
448
+
449
+ std::string name(length, 0);
450
+ fin.read(&name[0], length);
451
+
452
+ if (model.tensors.find(name.data()) == model.tensors.end()) {
453
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
454
+ return false;
455
+ }
456
+
457
+ auto tensor = model.tensors[name.data()];
458
+ if (ggml_nelements(tensor) != nelements) {
459
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
460
+ return false;
461
+ }
462
+
463
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
464
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
465
+ __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
466
+ return false;
467
+ }
468
+
469
+ const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
470
+
471
+ if (nelements*bpe != ggml_nbytes(tensor)) {
472
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
473
+ __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
474
+ return false;
475
+ }
476
+
477
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
478
+
479
+ //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
480
+ total_size += ggml_nbytes(tensor);
481
+ }
482
+
483
+ printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
484
+ }
485
+
486
+ fin.close();
487
+
488
+ return true;
489
+ }
490
+
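For reference, the weight-loading loop above consumes tensor records with the following on-disk layout (a descriptive sketch of the converted model file, not an actual struct in the code):

```cpp
// One tensor record in the converted gpt-2 model file, as read by the loop above:
//
//   int32_t n_dims;              // 1 or 2
//   int32_t length;              // length of the tensor name, in bytes
//   int32_t ftype;               // 0 = F32 data, otherwise F16
//   int32_t ne[n_dims];          // dimensions
//   char    name[length];        // e.g. "model/h0/attn/c_attn/w"
//   <raw data: ne[0]*ne[1]*... elements of 4 (F32) or 2 (F16) bytes>
//
// Records are read until EOF; each name must match one of the tensors
// registered in model.tensors and the element count and shape must agree exactly.
```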
491
+ // evaluate the transformer
492
+ //
493
+ // - model: the model
494
+ // - n_threads: number of threads to use
495
+ // - n_past: the context size so far
496
+ // - embd_inp: the embeddings of the tokens in the context
497
+ // - embd_w: the predicted probabilities of the next token
498
+ //
499
+ bool gpt2_eval(
500
+ const gpt2_model & model,
501
+ const int n_threads,
502
+ const int n_past,
503
+ const std::vector<gpt_vocab::id> & embd_inp,
504
+ std::vector<float> & embd_w,
505
+ size_t & mem_per_token) {
506
+ const int N = embd_inp.size();
507
+
508
+ const auto & hparams = model.hparams;
509
+
510
+ const int n_embd = hparams.n_embd;
511
+ const int n_layer = hparams.n_layer;
512
+ const int n_ctx = hparams.n_ctx;
513
+ const int n_head = hparams.n_head;
514
+ const int n_vocab = hparams.n_vocab;
515
+
516
+ static size_t buf_size = 512u*1024*1024;
517
+ static void * buf = malloc(buf_size);
518
+
519
+ if (mem_per_token > 0 && mem_per_token*N > buf_size) {
520
+ const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
521
+ printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
522
+
523
+ // reallocate
524
+ buf_size = buf_size_new;
525
+ buf = realloc(buf, buf_size);
526
+ if (buf == nullptr) {
527
+ fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
528
+ return false;
529
+ }
530
+ }
531
+
532
+ struct ggml_init_params params = {
533
+ .mem_size = buf_size,
534
+ .mem_buffer = buf,
535
+ };
536
+
537
+ struct ggml_context * ctx0 = ggml_init(params);
538
+ struct ggml_cgraph gf = { .n_threads = n_threads };
539
+
540
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
541
+ memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
542
+
543
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
544
+ for (int i = 0; i < N; ++i) {
545
+ ((int32_t *) position->data)[i] = n_past + i;
546
+ }
547
+
548
+ // wte + wpe
549
+ struct ggml_tensor * inpL =
550
+ ggml_add(ctx0,
551
+ ggml_get_rows(ctx0, model.wte, embd),
552
+ ggml_get_rows(ctx0, model.wpe, position));
553
+
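The input to the first layer is the token embedding plus the learned positional embedding, i.e. for the i-th token of the current batch:

$$ \text{inpL}_{:,\,i} \;=\; \text{wte}\big[\,\text{embd}_i\,\big] \;+\; \text{wpe}\big[\,n_{\text{past}} + i\,\big] $$

`ggml_get_rows` performs the two row lookups; the `position` tensor just materializes the indices `n_past ... n_past + N - 1`.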
554
+ for (int il = 0; il < n_layer; ++il) {
555
+ struct ggml_tensor * cur;
556
+
557
+ // norm
558
+ {
559
+ // [ 768, N]
560
+ cur = ggml_norm(ctx0, inpL);
561
+
562
+ // cur = ln_1_g*cur + ln_1_b
563
+ // [ 768, N]
564
+ cur = ggml_add(ctx0,
565
+ ggml_mul(ctx0,
566
+ ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
567
+ cur),
568
+ ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
569
+ }
570
+
571
+ // attn
572
+ // [2304, 768] - model.layers[il].c_attn_attn_w
573
+ // [2304, 1] - model.layers[il].c_attn_attn_b
574
+ // [ 768, N] - cur (in)
575
+ // [2304, N] - cur (out)
576
+ //
577
+ // cur = attn_w*cur + attn_b
578
+ // [2304, N]
579
+ {
580
+ cur = ggml_mul_mat(ctx0,
581
+ ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
582
+ cur);
583
+
584
+ cur = ggml_add(ctx0,
585
+ ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
586
+ cur);
587
+ }
588
+
589
+ // self-attention
590
+ {
591
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
592
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
593
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
594
+
595
+ // store key and value to memory
596
+ if (N >= 1) {
597
+ struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
598
+ struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
599
+
600
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
601
+ ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
602
+ }
603
+
604
+ // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
605
+ // [64, N, 12]
606
+ struct ggml_tensor * Q =
607
+ ggml_permute(ctx0,
608
+ ggml_cpy(ctx0,
609
+ Qcur,
610
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
611
+ 0, 2, 1, 3);
612
+
613
+ // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
614
+ // [64, n_past + N, 12]
615
+ struct ggml_tensor * K =
616
+ ggml_permute(ctx0,
617
+ ggml_reshape_3d(ctx0,
618
+ ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
619
+ n_embd/n_head, n_head, n_past + N),
620
+ 0, 2, 1, 3);
621
+
622
+ // GG: flash attention
623
+ //struct ggml_tensor * V =
624
+ // ggml_cpy(ctx0,
625
+ // ggml_permute(ctx0,
626
+ // ggml_reshape_3d(ctx0,
627
+ // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
628
+ // n_embd/n_head, n_head, n_past + N),
629
+ // 1, 2, 0, 3),
630
+ // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
631
+
632
+ //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
633
+
634
+ // K * Q
635
+ // [n_past + N, N, 12]
636
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
637
+
638
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
639
+ // [n_past + N, N, 12]
640
+ struct ggml_tensor * KQ_scaled =
641
+ ggml_scale(ctx0,
642
+ KQ,
643
+ ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
644
+ );
645
+
646
+ // KQ_masked = mask_past(KQ_scaled)
647
+ // [n_past + N, N, 12]
648
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
649
+
650
+ // KQ = soft_max(KQ_masked)
651
+ // [n_past + N, N, 12]
652
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
653
+
654
+ // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
655
+ // [n_past + N, 64, 12]
656
+ struct ggml_tensor * V_trans =
657
+ ggml_permute(ctx0,
658
+ ggml_reshape_3d(ctx0,
659
+ ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
660
+ n_embd/n_head, n_head, n_past + N),
661
+ 1, 2, 0, 3);
662
+
663
+ // KQV = transpose(V) * KQ_soft_max
664
+ // [64, N, 12]
665
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
666
+
667
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
668
+ // [64, 12, N]
669
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
670
+
671
+ // cur = KQV_merged.contiguous().view(n_embd, N)
672
+ // [768, N]
673
+ cur = ggml_cpy(ctx0,
674
+ KQV_merged,
675
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
676
+ }
677
+
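Taken together, the operations above implement standard masked scaled dot-product attention, evaluated for all 12 heads at once (d_head = n_embd/n_head = 64):

$$ \operatorname{Attention}(Q, K, V) \;=\; \operatorname{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_{\text{head}}}} + M\right) V $$

where M is the causal mask applied by `ggml_diag_mask_inf` (minus infinity for positions after the current token, so the softmax assigns them zero weight). The final `ggml_permute`/`ggml_cpy` pair only re-interleaves the per-head results back into a single [768, N] matrix.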
678
+ // projection
679
+ // [ 768, 768] - model.layers[il].c_attn_proj_w
680
+ // [ 768, 1] - model.layers[il].c_attn_proj_b
681
+ // [ 768, N] - cur (in)
682
+ // [ 768, N] - cur (out)
683
+ //
684
+ // cur = proj_w*cur + proj_b
685
+ // [768, N]
686
+ {
687
+ cur = ggml_mul_mat(ctx0,
688
+ ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
689
+ cur);
690
+
691
+ cur = ggml_add(ctx0,
692
+ ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
693
+ cur);
694
+ }
695
+
696
+ // add the input
697
+ cur = ggml_add(ctx0, cur, inpL);
698
+
699
+ struct ggml_tensor * inpFF = cur;
700
+
701
+ // feed-forward network
702
+ {
703
+ // norm
704
+ {
705
+ cur = ggml_norm(ctx0, inpFF);
706
+
707
+ // cur = ln_2_g*cur + ln_2_b
708
+ // [ 768, N]
709
+ cur = ggml_add(ctx0,
710
+ ggml_mul(ctx0,
711
+ ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
712
+ cur),
713
+ ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
714
+ }
715
+
716
+ // fully connected
717
+ // [3072, 768] - model.layers[il].c_mlp_fc_w
718
+ // [3072, 1] - model.layers[il].c_mlp_fc_b
719
+ // [ 768, N] - cur (in)
720
+ // [3072, N] - cur (out)
721
+ //
722
+ // cur = fc_w*cur + fc_b
723
+ // [3072, N]
724
+ cur = ggml_mul_mat(ctx0,
725
+ ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
726
+ cur);
727
+
728
+ cur = ggml_add(ctx0,
729
+ ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
730
+ cur);
731
+
732
+ // GELU activation
733
+ // [3072, N]
734
+ cur = ggml_gelu(ctx0, cur);
735
+
736
+ // projection
737
+ // [ 768, 3072] - model.layers[il].c_mlp_proj_w
738
+ // [ 768, 1] - model.layers[il].c_mlp_proj_b
739
+ // [3072, N] - cur (in)
740
+ // [ 768, N] - cur (out)
741
+ //
742
+ // cur = proj_w*cur + proj_b
743
+ // [768, N]
744
+ cur = ggml_mul_mat(ctx0,
745
+ model.layers[il].c_mlp_proj_w_trans,
746
+ cur);
747
+
748
+ cur = ggml_add(ctx0,
749
+ ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
750
+ cur);
751
+ }
752
+
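The feed-forward block is the usual GPT-2 MLP with a 4x hidden expansion (768 -> 3072 -> 768), applied after the second layer norm:

$$ \operatorname{FFN}(x) \;=\; W_{\text{proj}}\,\operatorname{gelu}\!\big(W_{\text{fc}}\,x + b_{\text{fc}}\big) + b_{\text{proj}} $$

with the residual connection added just below (`inpL = ggml_add(ctx0, cur, inpFF)`).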
753
+ // input for next layer
754
+ inpL = ggml_add(ctx0, cur, inpFF);
755
+ }
756
+
757
+ // norm
758
+ {
759
+ // [ 768, N]
760
+ inpL = ggml_norm(ctx0, inpL);
761
+
762
+ // inpL = ln_f_g*inpL + ln_f_b
763
+ // [ 768, N]
764
+ inpL = ggml_add(ctx0,
765
+ ggml_mul(ctx0,
766
+ ggml_repeat(ctx0, model.ln_f_g, inpL),
767
+ inpL),
768
+ ggml_repeat(ctx0, model.ln_f_b, inpL));
769
+ }
770
+
771
+ // inpL = WTE * inpL
772
+ // [ 768, 50257] - model.wte
773
+ // [ 768, N] - inpL
774
+ inpL = ggml_mul_mat(ctx0, model.wte, inpL);
775
+
776
+ // logits -> probs
777
+ inpL = ggml_soft_max(ctx0, inpL);
778
+
779
+ // run the computation
780
+ ggml_build_forward_expand(&gf, inpL);
781
+ ggml_graph_compute (ctx0, &gf);
782
+
783
+ //if (n_past%100 == 0) {
784
+ // ggml_graph_print (&gf);
785
+ // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
786
+ //}
787
+
788
+ //embd_w.resize(n_vocab*N);
789
+ //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
790
+
791
+ // return result for just the last token
792
+ embd_w.resize(n_vocab);
793
+ memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
794
+
795
+ if (mem_per_token == 0) {
796
+ mem_per_token = ggml_used_mem(ctx0)/N;
797
+ }
798
+ //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
799
+
800
+ ggml_free(ctx0);
801
+
802
+ return true;
803
+ }
804
+
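`gpt2_eval` is designed to be called incrementally: the prompt is evaluated once, then each newly sampled token is fed back with a growing `n_past`, so only the new token's activations are computed while everything else comes from the KV cache. `mem_per_token` is measured on the first call and used to grow the scratch buffer if longer batches arrive later. A minimal sketch of the calling pattern (it mirrors `gpt2_gen_text` below; the thread count and token count are example values, error handling omitted):

```cpp
// Sketch of driving gpt2_eval incrementally; 'model' and 'vocab' are assumed loaded.
std::vector<float> probs;          // receives n_vocab probabilities for the last token
size_t mem_per_token = 0;          // measured by the first call

std::vector<gpt_vocab::id> embd = ::gpt_tokenize(vocab, "Hello, how are you?");

int n_past = 0;
for (int i = 0; i < 16; ++i) {
    gpt2_eval(model, /*n_threads=*/4, n_past, embd, probs, mem_per_token);

    n_past += embd.size();         // the evaluated tokens now live in the KV cache

    // sample the next token from probs (e.g. with gpt_sample_top_k_top_p) ...
    gpt_vocab::id id = 0;          // placeholder for the sampled id
    embd = { id };
}
```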
805
+ /////////////////////////////// GPT-2 END ////////////////////////////////
806
+
807
+ constexpr int N_THREAD = 8;
808
+
809
+ struct gpt2_context {
810
+ std::string prompt_base = R"(Hello, how are you?
811
+ I'm fine, thanks. How are you?
812
+ Thanks, I'm fine too. What are you doing?
813
+ I'm just sitting here.
814
+ It's a lovely day, isn't it?
815
+ Yes, it is.
816
+ Did you know that I'm a robot?
817
+ I wasn't aware of that.
818
+ )";
819
+
820
+ std::mt19937 rng;
821
+
822
+ gpt_vocab vocab;
823
+ gpt2_model model;
824
+
825
+ int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
826
+
827
+ // sampling parameters
828
+ int32_t top_k = 40;
829
+ float top_p = 0.9f;
830
+ float temp = 1.0f;
831
+ };
832
+
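`top_k`, `top_p` and `temp` are the sampling parameters later passed to `gpt_sample_top_k_top_p`. The general idea of top-k / nucleus sampling, shown here as a generic sketch over a probability vector (not the actual implementation used by this example): keep only the `top_k` most probable tokens, cut them further to the smallest prefix whose cumulative probability reaches `top_p`, renormalize, and draw from what is left. The temperature flattens or sharpens the distribution and is omitted from the sketch.

```cpp
// Generic top-k / top-p sampling over a probability vector (illustration only).
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

int32_t sample_top_k_top_p(const std::vector<float> & probs, int top_k, float top_p, std::mt19937 & rng) {
    std::vector<std::pair<float, int32_t>> cand(probs.size());
    for (size_t i = 0; i < probs.size(); ++i) {
        cand[i] = { probs[i], (int32_t) i };
    }

    // 1. keep the top_k most probable tokens
    top_k = std::min(top_k, (int) cand.size());
    std::partial_sort(cand.begin(), cand.begin() + top_k, cand.end(),
            [](const auto & a, const auto & b) { return a.first > b.first; });
    cand.resize(top_k);

    // 2. keep the smallest prefix with cumulative probability >= top_p
    float cum = 0.0f;
    size_t n = cand.size();
    for (size_t i = 0; i < cand.size(); ++i) {
        cum += cand[i].first;
        if (cum >= top_p) { n = i + 1; break; }
    }
    cand.resize(n);

    // 3. renormalize (discrete_distribution does this implicitly) and sample
    std::vector<float> w(n);
    for (size_t i = 0; i < n; ++i) {
        w[i] = cand[i].first;
    }
    std::discrete_distribution<size_t> dist(w.begin(), w.end());

    return cand[dist(rng)].second;
}
```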
833
+ struct gpt2_context * gpt2_init(const char * path_model) {
834
+ gpt2_context * ctx = new gpt2_context;
835
+
836
+ ctx->rng = std::mt19937(time(NULL));
837
+
838
+ // load the model
839
+ {
840
+ const int64_t t_start_us = ggml_time_us();
841
+
842
+ if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
843
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
844
+ return nullptr;
845
+ }
846
+
847
+ const int64_t t_load_us = ggml_time_us() - t_start_us;
848
+
849
+ printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
850
+ }
851
+
852
+ return ctx;
853
+ }
854
+
855
+ void gpt2_free(struct gpt2_context * ctx) {
856
+ delete ctx;
857
+ }
858
+
859
+ const char * gpt2_get_prompt(struct gpt2_context * ctx) {
860
+ return ctx->prompt_base.c_str();
861
+ }
862
+
863
+ void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
864
+ ctx->prompt_base = prompt;
865
+ }
866
+
867
+ std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
868
+ return ::gpt_tokenize(ctx->vocab, text);
869
+ }
870
+
871
+ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
872
+ int n_past = 0;
873
+
874
+ std::vector<float> embd_w;
875
+
876
+ // tokenize the prompt
877
+ std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
878
+
879
+ int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
880
+
881
+ std::vector<gpt_vocab::id> embd = embd_inp;
882
+
883
+ size_t mem_per_token = 3000000;
884
+
885
+ std::string result;
886
+
887
+ for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
888
+ // predict
889
+ if (embd.size() > 0) {
890
+ if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
891
+ printf("gpt-2: failed to generate text\n");
892
+ return "";
893
+ }
894
+ }
895
+
896
+ n_past += embd.size();
897
+ embd.clear();
898
+
899
+ {
900
+ // sample next token
901
+ const int top_k = ctx->top_k;
902
+ const float top_p = ctx->top_p;
903
+ const float temp = ctx->temp;
904
+
905
+ const int n_vocab = ctx->model.hparams.n_vocab;
906
+
907
+ const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
908
+
909
+ // add it to the context
910
+ embd.push_back(id);
911
+ }
912
+
913
+ result += ctx->vocab.id_to_token[embd[0]];
914
+
915
+ // end of text token
916
+ if (embd.back() == 50256 ||
917
+ ctx->vocab.id_to_token[embd.back()] == "." ||
918
+ ctx->vocab.id_to_token[embd.back()] == "!" ||
919
+ ctx->vocab.id_to_token[embd.back()] == "?") {
920
+ break;
921
+ }
922
+ }
923
+
924
+ return result;
925
+ }

examples/talk.wasm/gpt-2.h ADDED
@@ -0,0 +1,27 @@
1
+ #pragma once
2
+
3
+ // TODO: Change to C-style API and move to ./examples for easy reuse.
4
+
5
+ #include <vector>
6
+ #include <map>
7
+ #include <string>
8
+
9
+ struct gpt_vocab {
10
+ using id = int32_t;
11
+ using token = std::string;
12
+
13
+ std::map<token, id> token_to_id;
14
+ std::map<id, token> id_to_token;
15
+ };
16
+
17
+ struct gpt2_context;
18
+
19
+ struct gpt2_context * gpt2_init(const char * path_model);
20
+ void gpt2_free(struct gpt2_context * ctx);
21
+
22
+ const char * gpt2_get_prompt(struct gpt2_context * ctx);
23
+ void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
24
+
25
+ std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
26
+
27
+ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
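Taken together, the header exposes a small opaque-handle API that emscripten.cpp can drive. A minimal usage sketch (the model path is an example; whether it is reachable depends on how the file is packaged for the WASM build):

```cpp
#include "gpt-2.h"
#include <cstdio>
#include <string>

int main() {
    struct gpt2_context * ctx = gpt2_init("gpt-2.bin"); // example path
    if (!ctx) {
        fprintf(stderr, "failed to load the GPT-2 model\n");
        return 1;
    }

    // seed the conversation with the built-in prompt plus the transcribed question
    std::string prompt = std::string(gpt2_get_prompt(ctx)) + "How old is the Earth?\n";

    // generate at most 32 tokens; generation stops early at end-of-text or . ! ?
    std::string reply = gpt2_gen_text(ctx, prompt.c_str(), 32);
    printf("%s\n", reply.c_str());

    gpt2_free(ctx);
    return 0;
}
```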
examples/talk.wasm/index-tmpl.html CHANGED
@@ -504,7 +504,7 @@
504
 
505
  function startRecording() {
506
  if (!context) {
507
- context = new AudioContext({sampleRate: 16000});
508
  }
509
 
510
  Module.set_status("");
 
504
 
505
  function startRecording() {
506
  if (!context) {
507
+ context = new AudioContext({sampleRate: 16000, noiseSuppression: true});
508
  }
509
 
510
  Module.set_status("");