ggerganov committed
Commit 30197de · unverified · 1 parent: 95fda6c

whisper : rename binaries + fix install (#2648)


* whisper : rename binaries + fix install

* cont : try to fix ci

* cont : fix emscripten builds

Files changed (44)
  1. README.md +26 -263
  2. examples/CMakeLists.txt +13 -36
  3. examples/bench/CMakeLists.txt +3 -1
  4. examples/bench/README.md +3 -6
  5. examples/{main → cli}/CMakeLists.txt +4 -2
  6. examples/{main → cli}/README.md +14 -4
  7. examples/{main/main.cpp → cli/cli.cpp} +0 -0
  8. examples/command/CMakeLists.txt +3 -2
  9. examples/command/README.md +8 -7
  10. examples/deprecation-warning/CMakeLists.txt +4 -0
  11. examples/deprecation-warning/README.md +17 -0
  12. examples/deprecation-warning/deprecation-warning.cpp +34 -0
  13. examples/generate-karaoke.sh +2 -2
  14. examples/livestream.sh +3 -3
  15. examples/server/CMakeLists.txt +3 -1
  16. examples/server/README.md +3 -3
  17. examples/server/server.cpp +2 -2
  18. examples/stream/CMakeLists.txt +3 -2
  19. examples/stream/README.md +6 -6
  20. examples/talk-llama/CMakeLists.txt +1 -2
  21. examples/talk-llama/README.md +9 -8
  22. examples/talk.wasm/CMakeLists.txt +0 -51
  23. examples/talk.wasm/README.md +0 -74
  24. examples/talk.wasm/emscripten.cpp +0 -368
  25. examples/talk.wasm/gpt-2.cpp +0 -808
  26. examples/talk.wasm/gpt-2.h +0 -21
  27. examples/talk.wasm/index-tmpl.html +0 -856
  28. examples/talk/.gitignore +0 -2
  29. examples/talk/CMakeLists.txt +0 -8
  30. examples/talk/README.md +0 -45
  31. examples/talk/eleven-labs.py +0 -80
  32. examples/talk/gpt-2.cpp +0 -809
  33. examples/talk/gpt-2.h +0 -21
  34. examples/talk/speak +0 -40
  35. examples/talk/speak.bat +0 -1
  36. examples/talk/speak.ps1 +0 -14
  37. examples/talk/talk.cpp +0 -376
  38. examples/twitch.sh +2 -2
  39. examples/yt-wsp.sh +1 -1
  40. scripts/bench-all.sh +3 -3
  41. scripts/bench-wts.sh +1 -1
  42. scripts/bench.py +1 -1
  43. scripts/quantize-all.sh +2 -2
  44. tests/run-tests.sh +1 -1
README.md CHANGED
@@ -53,18 +53,6 @@ On Apple Silicon, the inference runs fully on the GPU via Metal:
53
 
54
  https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
55
 
56
- Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
57
-
58
- ## Implementation details
59
-
60
- - The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
61
- - The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
62
- - Sample usage is demonstrated in [main.cpp](examples/main)
63
- - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
64
- - Various other examples are available in the [examples](examples) folder
65
-
66
- The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
67
-
68
  ## Quick start
69
 
70
  First clone the repository:
@@ -85,135 +73,26 @@ Then, download one of the Whisper [models](models/README.md) converted in [`ggml
85
  sh ./models/download-ggml-model.sh base.en
86
  ```
87
 
88
- Now build the [main](examples/main) example and transcribe an audio file like this:
89
 
90
  ```bash
91
- # build the main example
92
  cmake -B build
93
  cmake --build build --config Release
94
 
95
  # transcribe an audio file
96
- ./build/bin/main -f samples/jfk.wav
97
  ```
98
 
99
  ---
100
 
101
- For a quick demo, simply run `make base.en`:
102
-
103
- ```text
104
- $ make -j base.en
105
-
106
- cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
107
- c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
108
- c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o ggml.o -o main -framework Accelerate
109
- ./main -h
110
-
111
- usage: ./main [options] file0.wav file1.wav ...
112
-
113
- options:
114
- -h, --help [default] show this help message and exit
115
- -t N, --threads N [4 ] number of threads to use during computation
116
- -p N, --processors N [1 ] number of processors to use during computation
117
- -ot N, --offset-t N [0 ] time offset in milliseconds
118
- -on N, --offset-n N [0 ] segment index offset
119
- -d N, --duration N [0 ] duration of audio to process in milliseconds
120
- -mc N, --max-context N [-1 ] maximum number of text context tokens to store
121
- -ml N, --max-len N [0 ] maximum segment length in characters
122
- -sow, --split-on-word [false ] split on word rather than on token
123
- -bo N, --best-of N [5 ] number of best candidates to keep
124
- -bs N, --beam-size N [5 ] beam size for beam search
125
- -wt N, --word-thold N [0.01 ] word timestamp probability threshold
126
- -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
127
- -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
128
- -debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
129
- -tr, --translate [false ] translate from source language to english
130
- -di, --diarize [false ] stereo audio diarization
131
- -tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
132
- -nf, --no-fallback [false ] do not use temperature fallback while decoding
133
- -otxt, --output-txt [false ] output result in a text file
134
- -ovtt, --output-vtt [false ] output result in a vtt file
135
- -osrt, --output-srt [false ] output result in a srt file
136
- -olrc, --output-lrc [false ] output result in a lrc file
137
- -owts, --output-words [false ] output script for generating karaoke video
138
- -fp, --font-path [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
139
- -ocsv, --output-csv [false ] output result in a CSV file
140
- -oj, --output-json [false ] output result in a JSON file
141
- -ojf, --output-json-full [false ] include more information in the JSON file
142
- -of FNAME, --output-file FNAME [ ] output file path (without file extension)
143
- -ps, --print-special [false ] print special tokens
144
- -pc, --print-colors [false ] print colors
145
- -pp, --print-progress [false ] print progress
146
- -nt, --no-timestamps [false ] do not print timestamps
147
- -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
148
- -dl, --detect-language [false ] exit after automatically detecting language
149
- --prompt PROMPT [ ] initial prompt
150
- -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
151
- -f FNAME, --file FNAME [ ] input WAV file path
152
- -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
153
- -ls, --log-score [false ] log best decoder scores of tokens
154
- -ng, --no-gpu [false ] disable GPU
155
-
156
-
157
- sh ./models/download-ggml-model.sh base.en
158
- Downloading ggml model base.en ...
159
- ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
160
- Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
161
- You can now use it like this:
162
-
163
- $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
164
-
165
-
166
- ===============================================
167
- Running base.en on all samples in ./samples ...
168
- ===============================================
169
-
170
- ----------------------------------------------
171
- [+] Running base.en on samples/jfk.wav ... (run 'ffplay samples/jfk.wav' to listen)
172
- ----------------------------------------------
173
-
174
- whisper_init_from_file: loading model from 'models/ggml-base.en.bin'
175
- whisper_model_load: loading model
176
- whisper_model_load: n_vocab = 51864
177
- whisper_model_load: n_audio_ctx = 1500
178
- whisper_model_load: n_audio_state = 512
179
- whisper_model_load: n_audio_head = 8
180
- whisper_model_load: n_audio_layer = 6
181
- whisper_model_load: n_text_ctx = 448
182
- whisper_model_load: n_text_state = 512
183
- whisper_model_load: n_text_head = 8
184
- whisper_model_load: n_text_layer = 6
185
- whisper_model_load: n_mels = 80
186
- whisper_model_load: f16 = 1
187
- whisper_model_load: type = 2
188
- whisper_model_load: mem required = 215.00 MB (+ 6.00 MB per decoder)
189
- whisper_model_load: kv self size = 5.25 MB
190
- whisper_model_load: kv cross size = 17.58 MB
191
- whisper_model_load: adding 1607 extra tokens
192
- whisper_model_load: model ctx = 140.60 MB
193
- whisper_model_load: model size = 140.54 MB
194
-
195
- system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
196
-
197
- main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
198
-
199
-
200
- [00:00:00.000 --> 00:00:11.000] And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
201
-
202
-
203
- whisper_print_timings: fallbacks = 0 p / 0 h
204
- whisper_print_timings: load time = 113.81 ms
205
- whisper_print_timings: mel time = 15.40 ms
206
- whisper_print_timings: sample time = 11.58 ms / 27 runs ( 0.43 ms per run)
207
- whisper_print_timings: encode time = 266.60 ms / 1 runs ( 266.60 ms per run)
208
- whisper_print_timings: decode time = 66.11 ms / 27 runs ( 2.45 ms per run)
209
- whisper_print_timings: total time = 476.31 ms
210
- ```
211
 
212
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
213
 
214
- For detailed usage instructions, run: `./main -h`
215
 
216
- Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
217
  For example, you can use `ffmpeg` like this:
218
 
219
  ```bash
@@ -271,7 +150,7 @@ cmake --build build --config Release
271
  ./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
272
 
273
  # run the examples as usual, specifying the quantized model file
274
- ./build/bin/main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
275
  ```
276
 
277
  ## Core ML support
@@ -313,7 +192,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
313
  - Run the examples as usual. For example:
314
 
315
  ```text
316
- $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
317
 
318
  ...
319
 
@@ -397,7 +276,7 @@ This can result in significant speedup in encoder performance. Here are the inst
397
  - Run the examples as usual. For example:
398
 
399
  ```text
400
- $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
401
 
402
  ...
403
 
@@ -473,7 +352,7 @@ cmake --build build -j --config Release
473
  Run the inference examples as usual, for example:
474
 
475
  ```
476
- ./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
477
  ```
478
 
479
  *Notes:*
@@ -481,38 +360,6 @@ Run the inference examples as usual, for example:
481
  - If you have trouble with your Ascend NPU device, please create an issue with **[CANN]** prefix/tag.
482
  - If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
483
 
484
- ## Docker
485
-
486
- ### Prerequisites
487
-
488
- - Docker must be installed and running on your system.
489
- - Create a folder to store big models & intermediate files (ex. /whisper/models)
490
-
491
- ### Images
492
-
493
- We have two Docker images available for this project:
494
-
495
- 1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
496
- 2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
497
-
498
- ### Usage
499
-
500
- ```shell
501
- # download model and persist it in a local folder
502
- docker run -it --rm \
503
- -v path/to/models:/models \
504
- whisper.cpp:main "./models/download-ggml-model.sh base /models"
505
- # transcribe an audio file
506
- docker run -it --rm \
507
- -v path/to/models:/models \
508
- -v path/to/audios:/audios \
509
- whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
510
- # transcribe an audio file in samples folder
511
- docker run -it --rm \
512
- -v path/to/models:/models \
513
- whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
514
- ```
515
-
516
  ## Installing with Conan
517
 
518
  You can install pre-built binaries for whisper.cpp or build it from source using [Conan](https://conan.io/). Use the following command:
@@ -527,89 +374,6 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume
527
 
528
  - Inference only
529
 
530
- ## Another example
531
-
532
- Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
533
- in about half a minute on a MacBook M1 Pro, using `medium.en` model:
534
-
535
- <details>
536
- <summary>Expand to see the result</summary>
537
-
538
- ```text
539
- $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8
540
-
541
- whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
542
- whisper_model_load: loading model
543
- whisper_model_load: n_vocab = 51864
544
- whisper_model_load: n_audio_ctx = 1500
545
- whisper_model_load: n_audio_state = 1024
546
- whisper_model_load: n_audio_head = 16
547
- whisper_model_load: n_audio_layer = 24
548
- whisper_model_load: n_text_ctx = 448
549
- whisper_model_load: n_text_state = 1024
550
- whisper_model_load: n_text_head = 16
551
- whisper_model_load: n_text_layer = 24
552
- whisper_model_load: n_mels = 80
553
- whisper_model_load: f16 = 1
554
- whisper_model_load: type = 4
555
- whisper_model_load: mem required = 1720.00 MB (+ 43.00 MB per decoder)
556
- whisper_model_load: kv self size = 42.00 MB
557
- whisper_model_load: kv cross size = 140.62 MB
558
- whisper_model_load: adding 1607 extra tokens
559
- whisper_model_load: model ctx = 1462.35 MB
560
- whisper_model_load: model size = 1462.12 MB
561
-
562
- system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
563
-
564
- main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
565
-
566
-
567
- [00:00:00.000 --> 00:00:08.000] My fellow Americans, this day has brought terrible news and great sadness to our country.
568
- [00:00:08.000 --> 00:00:17.000] At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
569
- [00:00:17.000 --> 00:00:23.000] A short time later, debris was seen falling from the skies above Texas.
570
- [00:00:23.000 --> 00:00:29.000] The Columbia's lost. There are no survivors.
571
- [00:00:29.000 --> 00:00:32.000] On board was a crew of seven.
572
- [00:00:32.000 --> 00:00:39.000] Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
573
- [00:00:39.000 --> 00:00:48.000] Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
574
- [00:00:48.000 --> 00:00:52.000] a colonel in the Israeli Air Force.
575
- [00:00:52.000 --> 00:00:58.000] These men and women assumed great risk in the service to all humanity.
576
- [00:00:58.000 --> 00:01:03.000] In an age when space flight has come to seem almost routine,
577
- [00:01:03.000 --> 00:01:07.000] it is easy to overlook the dangers of travel by rocket
578
- [00:01:07.000 --> 00:01:12.000] and the difficulties of navigating the fierce outer atmosphere of the Earth.
579
- [00:01:12.000 --> 00:01:18.000] These astronauts knew the dangers, and they faced them willingly,
580
- [00:01:18.000 --> 00:01:23.000] knowing they had a high and noble purpose in life.
581
- [00:01:23.000 --> 00:01:31.000] Because of their courage and daring and idealism, we will miss them all the more.
582
- [00:01:31.000 --> 00:01:36.000] All Americans today are thinking as well of the families of these men and women
583
- [00:01:36.000 --> 00:01:40.000] who have been given this sudden shock and grief.
584
- [00:01:40.000 --> 00:01:45.000] You're not alone. Our entire nation grieves with you,
585
- [00:01:45.000 --> 00:01:52.000] and those you love will always have the respect and gratitude of this country.
586
- [00:01:52.000 --> 00:01:56.000] The cause in which they died will continue.
587
- [00:01:56.000 --> 00:02:04.000] Mankind is led into the darkness beyond our world by the inspiration of discovery
588
- [00:02:04.000 --> 00:02:11.000] and the longing to understand. Our journey into space will go on.
589
- [00:02:11.000 --> 00:02:16.000] In the skies today, we saw destruction and tragedy.
590
- [00:02:16.000 --> 00:02:22.000] Yet farther than we can see, there is comfort and hope.
591
- [00:02:22.000 --> 00:02:29.000] In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
592
- [00:02:29.000 --> 00:02:35.000] who created all these. He who brings out the starry hosts one by one
593
- [00:02:35.000 --> 00:02:39.000] and calls them each by name."
594
- [00:02:39.000 --> 00:02:46.000] Because of His great power and mighty strength, not one of them is missing.
595
- [00:02:46.000 --> 00:02:55.000] The same Creator who names the stars also knows the names of the seven souls we mourn today.
596
- [00:02:55.000 --> 00:03:01.000] The crew of the shuttle Columbia did not return safely to earth,
597
- [00:03:01.000 --> 00:03:05.000] yet we can pray that all are safely home.
598
- [00:03:05.000 --> 00:03:13.000] May God bless the grieving families, and may God continue to bless America.
599
- [00:03:13.000 --> 00:03:19.000] [Silence]
600
-
601
-
602
- whisper_print_timings: fallbacks = 1 p / 0 h
603
- whisper_print_timings: load time = 569.03 ms
604
- whisper_print_timings: mel time = 146.85 ms
605
- whisper_print_timings: sample time = 238.66 ms / 553 runs ( 0.43 ms per run)
606
- whisper_print_timings: encode time = 18665.10 ms / 9 runs ( 2073.90 ms per run)
607
- whisper_print_timings: decode time = 13090.93 ms / 549 runs ( 23.85 ms per run)
608
- whisper_print_timings: total time = 32733.52 ms
609
- ```
610
-
611
- </details>
612
-
613
  ## Real-time audio input example
614
 
615
  This is a naive example of performing real-time inference on audio from your microphone.
@@ -630,7 +394,7 @@ Adding the `--print-colors` argument will print the transcribed text using an ex
630
  to highlight words with high or low confidence:
631
 
632
  ```bash
633
- ./main -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
634
  ```
635
 
636
  <img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
@@ -640,7 +404,7 @@ to highlight words with high or low confidence:
640
  For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
641
 
642
  ```text
643
- $ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
644
 
645
  whisper_model_load: loading model from './models/ggml-base.en.bin'
646
  ...
@@ -664,7 +428,7 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
664
  The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
665
 
666
  ```text
667
- $ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
668
 
669
  whisper_model_load: loading model from './models/ggml-base.en.bin'
670
  ...
@@ -711,7 +475,7 @@ Sample usage:
711
  ./models/download-ggml-model.sh small.en-tdrz
712
 
713
  # run as usual, adding the "-tdrz" command-line argument
714
- ./main -f ./samples/a13.wav -m ./models/ggml-small.en-tdrz.bin -tdrz
715
  ...
716
  main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, tdrz = 1, timestamps = 1 ...
717
  ...
@@ -728,14 +492,14 @@ main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 pr
728
 
729
  ## Karaoke-style movie generation (experimental)
730
 
731
- The [main](examples/main) example provides support for output of karaoke-style movies, where the
732
  currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
733
  This requires `ffmpeg` to be installed.
734
 
735
  Here are a few _"typical"_ examples:
736
 
737
  ```bash
738
- ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
739
  source ./samples/jfk.wav.wts
740
  ffplay ./samples/jfk.wav.mp4
741
  ```
@@ -745,7 +509,7 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b
745
  ---
746
 
747
  ```bash
748
- ./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
749
  source ./samples/mm0.wav.wts
750
  ffplay ./samples/mm0.wav.mp4
751
  ```
@@ -755,7 +519,7 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9
755
  ---
756
 
757
  ```bash
758
- ./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
759
  source ./samples/gb0.wav.wts
760
  ffplay ./samples/gb0.wav.mp4
761
  ```
@@ -780,7 +544,7 @@ https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8
780
  ## Benchmarks
781
 
782
  In order to have an objective comparison of the performance of the inference across different system configurations,
783
- use the [bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
784
  took to execute it. The results are summarized in the following Github issue:
785
 
786
  [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
@@ -843,13 +607,12 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
843
 
844
  | Example | Web | Description |
845
  | --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
846
- | [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
847
- | [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
848
- | [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
849
- | [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
850
- | [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
851
- | [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
852
- | [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
853
  | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
854
  | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
855
  | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
@@ -857,7 +620,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
857
  | [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
858
  | [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
859
  | [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
860
- | [server](examples/server) | | HTTP transcription server with OAI-like API |
861
 
862
  ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
863
 
 
53
 
54
  https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
55
 
56
  ## Quick start
57
 
58
  First clone the repository:
 
73
  sh ./models/download-ggml-model.sh base.en
74
  ```
75
 
76
+ Now build the [whisper-cli](examples/cli) example and transcribe an audio file like this:
77
 
78
  ```bash
79
+ # build the project
80
  cmake -B build
81
  cmake --build build --config Release
82
 
83
  # transcribe an audio file
84
+ ./build/bin/whisper-cli -f samples/jfk.wav
85
  ```
86
 
87
  ---
88
 
89
+ For a quick demo, simply run `make base.en`.
90
 
91
  The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.
92
 
93
+ For detailed usage instructions, run: `./build/bin/whisper-cli -h`
94
 
95
+ Note that the [whisper-cli](examples/cli) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
96
  For example, you can use `ffmpeg` like this:
97
 
98
  ```bash
 
150
  ./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
151
 
152
  # run the examples as usual, specifying the quantized model file
153
+ ./build/bin/whisper-cli -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
154
  ```
155
 
156
  ## Core ML support
 
192
  - Run the examples as usual. For example:
193
 
194
  ```text
195
+ $ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
196
 
197
  ...
198
 
 
276
  - Run the examples as usual. For example:
277
 
278
  ```text
279
+ $ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
280
 
281
  ...
282
 
 
352
  Run the inference examples as usual, for example:
353
 
354
  ```
355
+ ./build/bin/whisper-cli -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
356
  ```
357
 
358
  *Notes:*
 
360
  - If you have trouble with your Ascend NPU device, please create an issue with **[CANN]** prefix/tag.
361
  - If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
362
 
363
  ## Installing with Conan
364
 
365
  You can install pre-built binaries for whisper.cpp or build it from source using [Conan](https://conan.io/). Use the following command:
 
374
 
375
  - Inference only
376
 
377
  ## Real-time audio input example
378
 
379
  This is a naive example of performing real-time inference on audio from your microphone.
 
394
  to highlight words with high or low confidence:
395
 
396
  ```bash
397
+ ./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
398
  ```
399
 
400
  <img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
 
404
  For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
405
 
406
  ```text
407
+ $ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
408
 
409
  whisper_model_load: loading model from './models/ggml-base.en.bin'
410
  ...
 
428
  The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
429
 
430
  ```text
431
+ $ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
432
 
433
  whisper_model_load: loading model from './models/ggml-base.en.bin'
434
  ...
 
475
  ./models/download-ggml-model.sh small.en-tdrz
476
 
477
  # run as usual, adding the "-tdrz" command-line argument
478
+ ./build/bin/whisper-cli -f ./samples/a13.wav -m ./models/ggml-small.en-tdrz.bin -tdrz
479
  ...
480
  main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, tdrz = 1, timestamps = 1 ...
481
  ...
 
492
 
493
  ## Karaoke-style movie generation (experimental)
494
 
495
+ The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
496
  currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
497
  This requires `ffmpeg` to be installed.
498
 
499
  Here are a few _"typical"_ examples:
500
 
501
  ```bash
502
+ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
503
  source ./samples/jfk.wav.wts
504
  ffplay ./samples/jfk.wav.mp4
505
  ```
 
509
  ---
510
 
511
  ```bash
512
+ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
513
  source ./samples/mm0.wav.wts
514
  ffplay ./samples/mm0.wav.mp4
515
  ```
 
519
  ---
520
 
521
  ```bash
522
+ ./build/bin/whisper-cli -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
523
  source ./samples/gb0.wav.wts
524
  ffplay ./samples/gb0.wav.mp4
525
  ```
 
544
  ## Benchmarks
545
 
546
  In order to have an objective comparison of the performance of the inference across different system configurations,
547
+ use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
548
  took to execute it. The results are summarized in the following Github issue:
549
 
550
  [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
 
607
 
608
  | Example | Web | Description |
609
  | --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
610
+ | [whisper-cli](examples/cli) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
611
+ | [whisper-bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
612
+ | [whisper-stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
613
+ | [whisper-command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
614
+ | [whisper-server](examples/server) | | HTTP transcription server with OAI-like API |
615
+ | [whisper-talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
 
616
  | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
617
  | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
618
  | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
 
620
  | [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
621
  | [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
622
  | [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
623
+ | [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
624
 
625
  ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
626
 
examples/CMakeLists.txt CHANGED
@@ -97,52 +97,29 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
97
 
98
  if (EMSCRIPTEN)
99
  add_subdirectory(whisper.wasm)
100
- set_target_properties(libmain PROPERTIES FOLDER "libs")
101
  add_subdirectory(stream.wasm)
102
- set_target_properties(libstream PROPERTIES FOLDER "libs")
103
  add_subdirectory(command.wasm)
104
- set_target_properties(libcommand PROPERTIES FOLDER "libs")
105
- #add_subdirectory(talk.wasm)
106
- #set_target_properties(libtalk PROPERTIES FOLDER "libs")
107
  add_subdirectory(bench.wasm)
108
- set_target_properties(libbench PROPERTIES FOLDER "libs")
109
  elseif(CMAKE_JS_VERSION)
110
  add_subdirectory(addon.node)
111
- set_target_properties(addon.node PROPERTIES FOLDER "examples")
112
  else()
113
- add_subdirectory(main)
114
- set_target_properties(main PROPERTIES FOLDER "examples")
115
- if (WHISPER_SDL2)
116
- add_subdirectory(stream)
117
- set_target_properties(stream PROPERTIES FOLDER "examples")
118
- endif (WHISPER_SDL2)
119
- add_subdirectory(server)
120
- set_target_properties(server PROPERTIES FOLDER "examples")
121
- if (WHISPER_SDL2)
122
- add_subdirectory(command)
123
- set_target_properties(command PROPERTIES FOLDER "examples")
124
- endif (WHISPER_SDL2)
125
  add_subdirectory(bench)
126
- set_target_properties(bench PROPERTIES FOLDER "examples")
127
  add_subdirectory(quantize)
128
- set_target_properties(quantize PROPERTIES FOLDER "examples")
129
- if (WHISPER_SDL2)
130
- # TODO: disabled until update
131
- # https://github.com/ggerganov/whisper.cpp/issues/1818
132
- #add_subdirectory(talk)
133
- #set_target_properties(talk PROPERTIES FOLDER "examples")
134
- add_subdirectory(talk-llama)
135
- set_target_properties(talk-llama PROPERTIES FOLDER "examples")
136
- add_subdirectory(lsp)
137
- set_target_properties(lsp PROPERTIES FOLDER "examples")
138
- if (GGML_SYCL)
139
- add_subdirectory(sycl)
140
- set_target_properties(ls-sycl-device PROPERTIES FOLDER "examples")
141
- endif()
142
- endif (WHISPER_SDL2)
143
  endif()
144
 
145
  if (WHISPER_SDL2)
146
  add_subdirectory(wchess)
147
- set_target_properties(wchess PROPERTIES FOLDER "examples")
148
  endif (WHISPER_SDL2)
 
97
 
98
  if (EMSCRIPTEN)
99
  add_subdirectory(whisper.wasm)
 
100
  add_subdirectory(stream.wasm)
 
101
  add_subdirectory(command.wasm)
 
 
 
102
  add_subdirectory(bench.wasm)
 
103
  elseif(CMAKE_JS_VERSION)
104
  add_subdirectory(addon.node)
 
105
  else()
106
+ add_subdirectory(cli)
107
  add_subdirectory(bench)
108
+ add_subdirectory(server)
109
  add_subdirectory(quantize)
110
+ if (WHISPER_SDL2)
111
+ add_subdirectory(stream)
112
+ add_subdirectory(command)
113
+ add_subdirectory(talk-llama)
114
+ add_subdirectory(lsp)
115
+ if (GGML_SYCL)
116
+ add_subdirectory(sycl)
117
+ endif()
118
+ endif (WHISPER_SDL2)
119
+
120
+ add_subdirectory(deprecation-warning)
 
 
 
 
121
  endif()
122
 
123
  if (WHISPER_SDL2)
124
  add_subdirectory(wchess)
 
125
  endif (WHISPER_SDL2)
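Note: in the restructured `examples/CMakeLists.txt` above, the SDL2-dependent tools (`whisper-stream`, `whisper-command`, `whisper-talk-llama`, `lsp`, and `wchess`) are only configured when `WHISPER_SDL2` is enabled. A minimal configure/build sketch, assuming SDL2 is already installed on the system:

```bash
# enable the SDL2-dependent examples, then build everything
cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release

# the renamed binaries land in build/bin/, e.g.
./build/bin/whisper-stream -m ./models/ggml-base.en.bin
```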
examples/bench/CMakeLists.txt CHANGED
@@ -1,6 +1,8 @@
1
- set(TARGET bench)
2
  add_executable(${TARGET} bench.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
  target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
 
 
 
1
+ set(TARGET whisper-bench)
2
  add_executable(${TARGET} bench.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
  target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
7
+
8
+ install(TARGETS ${TARGET} RUNTIME)
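The `install(TARGETS ${TARGET} RUNTIME)` rule added here (and to the other example CMakeLists files in this commit) is what lets `cmake --install` copy the renamed binaries into a prefix, which appears to be the "fix install" part of the change. A rough sketch of the intended flow, assuming CMake 3.15+ and a conventional `/usr/local` prefix:

```bash
cmake -B build
cmake --build build --config Release

# copy the whisper library and the installed whisper-* executables into the prefix
sudo cmake --install build --prefix /usr/local
```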
examples/bench/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # bench
2
 
3
  A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of
4
  the transformer on some random audio data and records the execution time. This way we can have an objective comparison
@@ -7,11 +7,8 @@ of the performance of the model for various setups.
7
  Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
8
 
9
  ```bash
10
- # build the bench tool
11
- $ make bench
12
-
13
- # run it on the small.en model using 4 threads
14
- $ ./bench -m ./models/ggml-small.en.bin -t 4
15
 
16
  whisper_model_load: loading model from './models/ggml-small.en.bin'
17
  whisper_model_load: n_vocab = 51864
 
1
+ # whisper.cpp/examples/bench
2
 
3
  A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of
4
  the transformer on some random audio data and records the execution time. This way we can have an objective comparison
 
7
  Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
8
 
9
  ```bash
10
+ # run the bench tool on the small.en model using 4 threads
11
+ $ ./build/bin/whisper-bench -m ./models/ggml-small.en.bin -t 4
 
 
 
12
 
13
  whisper_model_load: loading model from './models/ggml-small.en.bin'
14
  whisper_model_load: n_vocab = 51864
examples/{main → cli}/CMakeLists.txt RENAMED
@@ -1,6 +1,8 @@
1
- set(TARGET main)
2
- add_executable(${TARGET} main.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
  target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 
 
 
1
+ set(TARGET whisper-cli)
2
+ add_executable(${TARGET} cli.cpp)
3
 
4
  include(DefaultTargetOptions)
5
 
6
  target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
7
+
8
+ install(TARGETS ${TARGET} RUNTIME)
examples/{main → cli}/README.md RENAMED
@@ -1,12 +1,12 @@
1
- # main
2
 
3
  This is the main example demonstrating most of the functionality of the Whisper model.
4
  It can be used as a reference for using the `whisper.cpp` library in other projects.
5
 
6
  ```
7
- ./main -h
8
 
9
- usage: ./main [options] file0.wav file1.wav ...
10
 
11
  options:
12
  -h, --help [default] show this help message and exit
@@ -20,9 +20,12 @@ options:
20
  -sow, --split-on-word [false ] split on word rather than on token
21
  -bo N, --best-of N [5 ] number of best candidates to keep
22
  -bs N, --beam-size N [5 ] beam size for beam search
 
23
  -wt N, --word-thold N [0.01 ] word timestamp probability threshold
24
  -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
25
  -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
 
 
26
  -debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
27
  -tr, --translate [false ] translate from source language to english
28
  -di, --diarize [false ] stereo audio diarization
@@ -38,16 +41,23 @@ options:
38
  -oj, --output-json [false ] output result in a JSON file
39
  -ojf, --output-json-full [false ] include more information in the JSON file
40
  -of FNAME, --output-file FNAME [ ] output file path (without file extension)
 
41
  -ps, --print-special [false ] print special tokens
42
  -pc, --print-colors [false ] print colors
43
  -pp, --print-progress [false ] print progress
44
  -nt, --no-timestamps [false ] do not print timestamps
45
  -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
46
  -dl, --detect-language [false ] exit after automatically detecting language
47
- --prompt PROMPT [ ] initial prompt
48
  -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
49
  -f FNAME, --file FNAME [ ] input WAV file path
50
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
 
51
  -ls, --log-score [false ] log best decoder scores of tokens
52
  -ng, --no-gpu [false ] disable GPU
 
 
 
 
 
53
  ```
 
1
+ # whisper.cpp/examples/cli
2
 
3
  This is the main example demonstrating most of the functionality of the Whisper model.
4
  It can be used as a reference for using the `whisper.cpp` library in other projects.
5
 
6
  ```
7
+ ./build/bin/whisper-cli -h
8
 
9
+ usage: ./build/bin/whisper-cli [options] file0.wav file1.wav ...
10
 
11
  options:
12
  -h, --help [default] show this help message and exit
 
20
  -sow, --split-on-word [false ] split on word rather than on token
21
  -bo N, --best-of N [5 ] number of best candidates to keep
22
  -bs N, --beam-size N [5 ] beam size for beam search
23
+ -ac N, --audio-ctx N [0 ] audio context size (0 - all)
24
  -wt N, --word-thold N [0.01 ] word timestamp probability threshold
25
  -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
26
  -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
27
+ -tp, --temperature N [0.00 ] The sampling temperature, between 0 and 1
28
+ -tpi, --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1
29
  -debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
30
  -tr, --translate [false ] translate from source language to english
31
  -di, --diarize [false ] stereo audio diarization
 
41
  -oj, --output-json [false ] output result in a JSON file
42
  -ojf, --output-json-full [false ] include more information in the JSON file
43
  -of FNAME, --output-file FNAME [ ] output file path (without file extension)
44
+ -np, --no-prints [false ] do not print anything other than the results
45
  -ps, --print-special [false ] print special tokens
46
  -pc, --print-colors [false ] print colors
47
  -pp, --print-progress [false ] print progress
48
  -nt, --no-timestamps [false ] do not print timestamps
49
  -l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
50
  -dl, --detect-language [false ] exit after automatically detecting language
51
+ --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens)
52
  -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
53
  -f FNAME, --file FNAME [ ] input WAV file path
54
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
55
+ -dtw MODEL --dtw MODEL [ ] compute token-level timestamps
56
  -ls, --log-score [false ] log best decoder scores of tokens
57
  -ng, --no-gpu [false ] disable GPU
58
+ -fa, --flash-attn [false ] flash attention
59
+ --suppress-regex REGEX [ ] regular expression matching tokens to suppress
60
+ --grammar GRAMMAR [ ] GBNF grammar to guide decoding
61
+ --grammar-rule RULE [ ] top-level GBNF grammar rule name
62
+ --grammar-penalty N [100.0 ] scales down logits of nongrammar tokens
63
  ```
examples/{main/main.cpp → cli/cli.cpp} RENAMED
File without changes
examples/command/CMakeLists.txt CHANGED
@@ -1,9 +1,10 @@
1
  if (WHISPER_SDL2)
2
- # command
3
- set(TARGET command)
4
  add_executable(${TARGET} command.cpp)
5
 
6
  include(DefaultTargetOptions)
7
 
8
  target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 
 
9
  endif ()
 
1
  if (WHISPER_SDL2)
2
+ set(TARGET whisper-command)
 
3
  add_executable(${TARGET} command.cpp)
4
 
5
  include(DefaultTargetOptions)
6
 
7
  target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
8
+
9
+ install(TARGETS ${TARGET} RUNTIME)
10
  endif ()
examples/command/README.md CHANGED
@@ -1,14 +1,14 @@
1
- # command
2
 
3
  This is a basic Voice Assistant example that accepts voice commands from the microphone.
4
  More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
5
 
6
  ```bash
7
  # Run with default arguments and small model
8
- ./command -m ./models/ggml-small.en.bin -t 8
9
 
10
  # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
11
- ./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
12
  ```
13
 
14
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
@@ -23,10 +23,10 @@ Initial tests show that this approach might be extremely efficient in terms of p
23
 
24
  ```bash
25
  # Run in guided mode, the list of allowed commands is in commands.txt
26
- ./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt
27
 
28
  # On Raspberry Pi, in guided mode you can use "-ac 128" for extra performance
29
- ./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
30
  ```
31
 
32
  https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
@@ -34,7 +34,7 @@ https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9
34
 
35
  ## Building
36
 
37
- The `command` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
38
 
39
  ```bash
40
  # Install SDL2
@@ -47,5 +47,6 @@ sudo dnf install SDL2 SDL2-devel
47
  # Install SDL2 on Mac OS
48
  brew install sdl2
49
 
50
- make command
 
51
  ```
 
1
+ # whisper.cpp/examples/command
2
 
3
  This is a basic Voice Assistant example that accepts voice commands from the microphone.
4
  More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/issues/171).
5
 
6
  ```bash
7
  # Run with default arguments and small model
8
+ ./whisper-command -m ./models/ggml-small.en.bin -t 8
9
 
10
  # On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
11
+ ./whisper-command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
12
  ```
13
 
14
  https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
 
23
 
24
  ```bash
25
  # Run in guided mode, the list of allowed commands is in commands.txt
26
+ ./whisper-command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt
27
 
28
  # On Raspberry Pi, in guided mode you can use "-ac 128" for extra performance
29
+ ./whisper-command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
30
  ```
31
 
32
  https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
 
34
 
35
  ## Building
36
 
37
+ The `whisper-command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
38
 
39
  ```bash
40
  # Install SDL2
 
47
  # Install SDL2 on Mac OS
48
  brew install sdl2
49
 
50
+ cmake -B build -DWHISPER_SDL2=ON
51
+ cmake --build build --config Release
52
  ```
examples/deprecation-warning/CMakeLists.txt ADDED
@@ -0,0 +1,4 @@
1
+ add_executable(main ./deprecation-warning.cpp)
2
+ add_executable(bench ./deprecation-warning.cpp)
3
+ add_executable(stream ./deprecation-warning.cpp)
4
+ add_executable(command ./deprecation-warning.cpp)
examples/deprecation-warning/README.md ADDED
@@ -0,0 +1,17 @@
1
+ # Migration notice for binary filenames
2
+
3
+ > [!IMPORTANT]
4
+ [2024 Dec 20] Binaries have been renamed with a `whisper-` prefix. `main` is now `whisper-cli`, `server` is `whisper-server`, etc. (https://github.com/ggerganov/whisper.cpp/pull/2648)
5
+
6
+ This migration was important, but it is a breaking change that may not always be immediately obvious to users.
7
+
8
+ Please update all scripts and workflows to use the new binary names.
9
+
10
+ | Old Filename | New Filename |
11
+ | ---- | ---- |
12
+ | main | whisper-cli |
13
+ | bench | whisper-bench |
14
+ | stream | whisper-stream |
15
+ | command | whisper-command |
16
+ | server | whisper-server |
17
+ | talk-llama | whisper-talk-llama |
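Updating existing scripts is normally a one-line substitution per invocation. A minimal before/after sketch (paths assume the binaries were built into `build/bin/`):

```bash
# before: old binary name
./main -m models/ggml-base.en.bin -f samples/jfk.wav

# after: renamed binary
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
```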
examples/deprecation-warning/deprecation-warning.cpp ADDED
@@ -0,0 +1,34 @@
1
+ // Warns users that this filename was deprecated, and provides a link for more information.
2
+
3
+ #include <cstdio>
4
+ #include <string>
5
+
6
+ // Main
7
+ int main(int argc, char** argv) {
8
+ std::string filename = "main";
9
+ if (argc >= 1) {
10
+ filename = argv[0];
11
+ }
12
+
13
+ // Get only the program name from the full path
14
+ size_t pos = filename.find_last_of("/\\");
15
+ if (pos != std::string::npos) {
16
+ filename = filename.substr(pos+1);
17
+ }
18
+
19
+ // Append "whisper-" to the beginning of filename to get the replacement filename
20
+ std::string replacement_filename = "whisper-" + filename;
21
+
22
+ // The exception is if the filename is "main", then our replacement filename is "whisper-cli"
23
+ if (filename == "main") {
24
+ replacement_filename = "whisper-cli";
25
+ }
26
+
27
+ fprintf(stdout, "\n");
28
+ fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
29
+ fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
30
+ fprintf(stdout, " See https://github.com/ggerganov/whisper.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
31
+ fprintf(stdout, "\n");
32
+
33
+ return EXIT_FAILURE;
34
+ }
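The stub above keeps the old binary names building and running just long enough to redirect users: it derives the replacement name from `argv[0]`, prints a warning, and exits with a failure status. Roughly what invoking a deprecated name looks like (output reconstructed from the `fprintf` calls above):

```bash
./build/bin/main
# WARNING: The binary 'main' is deprecated.
#          Please use 'whisper-cli' instead.
#          See https://github.com/ggerganov/whisper.cpp/tree/master/examples/deprecation-warning/README.md for more information.
# (the process exits with a non-zero status)
```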
examples/generate-karaoke.sh CHANGED
@@ -11,7 +11,7 @@
11
  # Press Ctrl+C to stop recording
12
  #
13
 
14
- executable="./main"
15
  model="base.en"
16
  model_path="models/ggml-$model.bin"
17
 
@@ -46,7 +46,7 @@ ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>
46
 
47
  # run Whisper
48
  echo "Processing ..."
49
- ./main -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
50
 
51
  # generate Karaoke video
52
  echo "Generating video ..."
 
11
  # Press Ctrl+C to stop recording
12
  #
13
 
14
+ executable="./build/bin/whisper-cli"
15
  model="base.en"
16
  model_path="models/ggml-$model.bin"
17
 
 
46
 
47
  # run Whisper
48
  echo "Processing ..."
49
+ ${executable} -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
50
 
51
  # generate Karaoke video
52
  echo "Generating video ..."
examples/livestream.sh CHANGED
@@ -14,7 +14,7 @@ model="base.en"
14
 
15
  check_requirements()
16
  {
17
- if ! command -v ./main &>/dev/null; then
18
  echo "whisper.cpp main executable is required (make)"
19
  exit 1
20
  fi
@@ -100,7 +100,7 @@ while [ $running -eq 1 ]; do
100
  err=$(cat /tmp/whisper-live.err | wc -l)
101
  done
102
 
103
- ./main -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
104
 
105
  while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
106
  sleep 1
@@ -109,4 +109,4 @@ while [ $running -eq 1 ]; do
109
  done
110
 
111
  killall -v ffmpeg
112
- killall -v main
 
14
 
15
  check_requirements()
16
  {
17
+ if ! command -v ./build/bin/whisper-cli &>/dev/null; then
18
  echo "whisper.cpp main executable is required (make)"
19
  exit 1
20
  fi
 
100
  err=$(cat /tmp/whisper-live.err | wc -l)
101
  done
102
 
103
+ ./build/bin/whisper-cli -t 8 -m ./models/ggml-${model}.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
104
 
105
  while [ $SECONDS -lt $((($i+1)*$step_s)) ]; do
106
  sleep 1
 
109
  done
110
 
111
  killall -v ffmpeg
112
+ killall -v whisper-cli
examples/server/CMakeLists.txt CHANGED
@@ -1,4 +1,4 @@
1
- set(TARGET server)
2
  add_executable(${TARGET} server.cpp httplib.h)
3
 
4
  include(DefaultTargetOptions)
@@ -8,3 +8,5 @@ target_link_libraries(${TARGET} PRIVATE common json_cpp whisper ${CMAKE_THREAD_L
8
  if (WIN32)
9
  target_link_libraries(${TARGET} PRIVATE ws2_32)
10
  endif()
 
 
 
1
+ set(TARGET whisper-server)
2
  add_executable(${TARGET} server.cpp httplib.h)
3
 
4
  include(DefaultTargetOptions)
 
8
  if (WIN32)
9
  target_link_libraries(${TARGET} PRIVATE ws2_32)
10
  endif()
11
+
12
+ install(TARGETS ${TARGET} RUNTIME)
examples/server/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # whisper.cpp http server
2
 
3
  Simple HTTP server. WAV files are passed to the inference model via HTTP requests.
4
 
@@ -7,9 +7,9 @@ https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-
7
  ## Usage
8
 
9
  ```
10
- ./server -h
11
 
12
- usage: ./bin/server [options]
13
 
14
  options:
15
  -h, --help [default] show this help message and exit
 
1
+ # whisper.cpp/examples/server
2
 
3
  Simple HTTP server. WAV files are passed to the inference model via HTTP requests.
4
 
 
7
  ## Usage
8
 
9
  ```
10
+ ./build/bin/whisper-server -h
11
 
12
+ usage: ./build/bin/whisper-server [options]
13
 
14
  options:
15
  -h, --help [default] show this help message and exit
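Beyond `-h`, the usual workflow is to start `whisper-server` with a model and send WAV files to it over HTTP. A hedged sketch, assuming the server's default `127.0.0.1:8080` bind address and its `/inference` endpoint (check `whisper-server -h` for the options available in your build):

```bash
# start the server with a model
./build/bin/whisper-server -m models/ggml-base.en.bin

# from another shell, post a 16-bit WAV file for transcription
curl 127.0.0.1:8080/inference \
  -F file="@samples/jfk.wav" \
  -F response_format="json"
```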
examples/server/server.cpp CHANGED
@@ -677,7 +677,8 @@ int main(int argc, char ** argv) {
677
  if (sparams.ffmpeg_converter) {
678
  // if file is not wav, convert to wav
679
  // write to temporary file
680
- const std::string temp_filename_base = std::tmpnam(nullptr);
 
681
  const std::string temp_filename = temp_filename_base + ".wav";
682
  std::ofstream temp_file{temp_filename, std::ios::binary};
683
  temp_file << audio_file.content;
@@ -711,7 +712,6 @@ int main(int argc, char ** argv) {
711
  }
712
  }
713
 
714
-
715
  printf("Successfully loaded %s\n", filename.c_str());
716
 
717
  // print system information
 
677
  if (sparams.ffmpeg_converter) {
678
  // if file is not wav, convert to wav
679
  // write to temporary file
680
+ //const std::string temp_filename_base = std::tmpnam(nullptr);
681
+ const std::string temp_filename_base = "whisper-server-tmp"; // TODO: this is a hack, remove when the mutex is removed
682
  const std::string temp_filename = temp_filename_base + ".wav";
683
  std::ofstream temp_file{temp_filename, std::ios::binary};
684
  temp_file << audio_file.content;
 
712
  }
713
  }
714
 
 
715
  printf("Successfully loaded %s\n", filename.c_str());
716
 
717
  // print system information
examples/stream/CMakeLists.txt CHANGED
@@ -1,9 +1,10 @@
1
  if (WHISPER_SDL2)
2
- # stream
3
- set(TARGET stream)
4
  add_executable(${TARGET} stream.cpp)
5
 
6
  include(DefaultTargetOptions)
7
 
8
  target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
 
 
9
  endif ()
 
1
  if (WHISPER_SDL2)
2
+ set(TARGET whisper-stream)
 
3
  add_executable(${TARGET} stream.cpp)
4
 
5
  include(DefaultTargetOptions)
6
 
7
  target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
8
+
9
+ install(TARGETS ${TARGET} RUNTIME)
10
  endif ()
examples/stream/README.md CHANGED
@@ -1,11 +1,11 @@
1
- # stream
2
 
3
  This is a naive example of performing real-time inference on audio from your microphone.
4
- The `stream` tool samples the audio every half a second and runs the transcription continously.
5
  More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6
 
7
  ```bash
8
- ./build/bin/stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9
  ```
10
 
11
  https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
@@ -15,7 +15,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
15
  Setting the `--step` argument to `0` enables the sliding window mode:
16
 
17
  ```bash
18
- ./build/bin/stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
19
  ```
20
 
21
  In this mode, the tool will transcribe only after some speech activity is detected. A very
@@ -27,7 +27,7 @@ a transcription block that is suitable for parsing.
27
 
28
  ## Building
29
 
30
- The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
31
 
32
  ```bash
33
  # Install SDL2
@@ -43,7 +43,7 @@ brew install sdl2
43
  cmake -B build -DWHISPER_SDL2=ON
44
  cmake --build build --config Release
45
 
46
- ./build/bin/stream
47
  ```
48
 
49
  ## Web version
 
1
+ # whisper.cpp/examples/stream
2
 
3
  This is a naive example of performing real-time inference on audio from your microphone.
4
+ The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
5
  More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
6
 
7
  ```bash
8
+ ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
9
  ```
10
 
11
  https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
 
15
  Setting the `--step` argument to `0` enables the sliding window mode:
16
 
17
  ```bash
18
+ ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
19
  ```
20
 
21
  In this mode, the tool will transcribe only after some speech activity is detected. A very
 
27
 
28
  ## Building
29
 
30
+ The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
31
 
32
  ```bash
33
  # Install SDL2
 
43
  cmake -B build -DWHISPER_SDL2=ON
44
  cmake --build build --config Release
45
 
46
+ ./build/bin/whisper-stream
47
  ```
48
 
49
  ## Web version
examples/talk-llama/CMakeLists.txt CHANGED
@@ -1,6 +1,5 @@
1
  if (WHISPER_SDL2)
2
- # talk-llama
3
- set(TARGET talk-llama)
4
  add_executable(${TARGET} talk-llama.cpp
5
  llama.cpp
6
  llama-vocab.cpp
 
1
  if (WHISPER_SDL2)
2
+ set(TARGET whisper-talk-llama)
 
3
  add_executable(${TARGET} talk-llama.cpp
4
  llama.cpp
5
  llama-vocab.cpp
examples/talk-llama/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # talk-llama
2
 
3
  Talk with an LLaMA AI in your terminal
4
 
@@ -12,7 +12,7 @@ https://github.com/ggerganov/whisper.cpp/assets/1991296/d97a3788-bf2a-4756-9a43-
12
 
13
  ## Building
14
 
15
- The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
16
 
17
  ```bash
18
  # Install SDL2
@@ -25,11 +25,12 @@ sudo dnf install SDL2 SDL2-devel
25
  # Install SDL2 on Mac OS
26
  brew install sdl2
27
 
28
- # Build the "talk-llama" executable
29
- make talk-llama
 
30
 
31
  # Run it
32
- ./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
33
  ```
34
 
35
  - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
@@ -37,16 +38,16 @@ make talk-llama
37
 
38
  ## Session
39
 
40
- The `talk-llama` tool supports session management to enable more coherent and continuous conversations. By maintaining context from previous interactions, it can better understand and respond to user requests in a more natural way.
41
 
42
- To enable session support, use the `--session FILE` command line option when running the program. The `talk-llama` model state will be saved to the specified file after each interaction. If the file does not exist, it will be created. If the file exists, the model state will be loaded from it, allowing you to resume a previous session.
43
 
44
  This feature is especially helpful for maintaining context in long conversations or when interacting with the AI assistant across multiple sessions. It ensures that the assistant remembers the previous interactions and can provide more relevant and contextual responses.
45
 
46
  Example usage:
47
 
48
  ```bash
49
- ./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
50
  ```
51
 
52
  ## TTS
 
1
+ # whisper.cpp/examples/talk-llama
2
 
3
  Talk with an LLaMA AI in your terminal
4
 
 
12
 
13
  ## Building
14
 
15
+ The `whisper-talk-llama` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
16
 
17
  ```bash
18
  # Install SDL2
 
25
  # Install SDL2 on Mac OS
26
  brew install sdl2
27
 
28
+ # Build the "whisper-talk-llama" executable
29
+ cmake -B build -S . -DWHISPER_SDL2=ON
30
+ cmake --build build --config Release
31
 
32
  # Run it
33
+ ./build/bin/whisper-talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
34
  ```
35
 
36
  - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
 
38
 
39
  ## Session
40
 
41
+ The `whisper-talk-llama` tool supports session management to enable more coherent and continuous conversations. By maintaining context from previous interactions, it can better understand and respond to user requests in a more natural way.
42
 
43
+ To enable session support, use the `--session FILE` command line option when running the program. The `whisper-talk-llama` model state will be saved to the specified file after each interaction. If the file does not exist, it will be created. If the file exists, the model state will be loaded from it, allowing you to resume a previous session.
44
 
45
  This feature is especially helpful for maintaining context in long conversations or when interacting with the AI assistant across multiple sessions. It ensures that the assistant remembers the previous interactions and can provide more relevant and contextual responses.
46
 
47
  Example usage:
48
 
49
  ```bash
50
+ ./build/bin/whisper-talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf -p "Georgi" -t 8
51
  ```
52
 
53
  ## TTS
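
Since the target is now `whisper-talk-llama` rather than `talk-llama`, the old `make talk-llama` shortcut no longer applies. A minimal sketch of building only this example and resuming a saved session, assuming the model paths and flags shown in the README hunks above:

```bash
# Build just the renamed target (requires the SDL2 option at configure time),
# then resume an earlier conversation from the session file.
cmake -B build -DWHISPER_SDL2=ON
cmake --build build --config Release --target whisper-talk-llama
./build/bin/whisper-talk-llama --session ./my-session-file \
    -mw ./models/ggml-small.en.bin \
    -ml ../llama.cpp/models/llama-13b/ggml-model-q4_0.gguf \
    -p "Georgi" -t 8
```
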
examples/talk.wasm/CMakeLists.txt DELETED
@@ -1,51 +0,0 @@
1
- #
2
- # libtalk
3
- #
4
-
5
- set(TARGET libtalk)
6
-
7
- add_executable(${TARGET}
8
- emscripten.cpp
9
- gpt-2.cpp
10
- )
11
-
12
- include(DefaultTargetOptions)
13
-
14
- target_link_libraries(${TARGET} PRIVATE
15
- whisper
16
- common
17
- )
18
-
19
- unset(EXTRA_FLAGS)
20
-
21
- if (WHISPER_WASM_SINGLE_FILE)
22
- set(EXTRA_FLAGS "-s SINGLE_FILE=1")
23
- message(STATUS "Embedding WASM inside talk.js")
24
-
25
- add_custom_command(
26
- TARGET ${TARGET} POST_BUILD
27
- COMMAND ${CMAKE_COMMAND} -E copy
28
- ${CMAKE_BINARY_DIR}/bin/libtalk.js
29
- ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/talk.wasm/talk.js
30
- )
31
- endif()
32
-
33
- set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
34
- --bind \
35
- -s USE_PTHREADS=1 \
36
- -s PTHREAD_POOL_SIZE=8 \
37
- -s INITIAL_MEMORY=1800MB \
38
- -s TOTAL_MEMORY=1800MB \
39
- -s FORCE_FILESYSTEM=1 \
40
- -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
41
- ${EXTRA_FLAGS} \
42
- ")
43
-
44
- #
45
- # talk.wasm
46
- #
47
-
48
- set(TARGET talk.wasm)
49
-
50
- configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
51
- configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
examples/talk.wasm/README.md DELETED
@@ -1,74 +0,0 @@
1
- # talk.wasm
2
-
3
- Talk with an Artificial Intelligence in your browser:
4
-
5
- [https://user-images.githubusercontent.com/1991296/203411580-fedb4839-05e4-4474-8364-aaf1e9a9b615.mp4](https://user-images.githubusercontent.com/1991296/203845553-f7b44e13-9a15-4fc8-b518-ae8f4c6770fe.mp4)
6
-
7
- Online demo: https://whisper.ggerganov.com/talk/
8
-
9
- Terminal version: [examples/talk](/examples/talk)
10
-
11
- ## How it works?
12
-
13
- This demo leverages 2 modern neural network models to create a high-quality voice chat directly in your browser:
14
-
15
- - [OpenAI's Whisper](https://github.com/openai/whisper) speech recognition model is used to process your voice and understand what you are saying
16
- - Upon receiving some voice input, the AI generates a text response using [OpenAI's GPT-2](https://github.com/openai/gpt-2) language model
17
- - The AI then vocalizes the response using the browser's [Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)
18
-
19
- The web page does the processing locally on your machine. The processing of these heavy neural network models in the
20
- browser is possible by implementing them efficiently in C/C++ and using the browser's WebAssembly SIMD capabilities for
21
- extra performance:
22
-
23
- - The Whisper C++ implementation is here: [whisper.h](/whisper.h) / [whisper.cpp](/whisper.cpp)
24
- - The GPT-2 C++ implementation is here: [gpt-2.h](gpt-2.h) / [gpt-2.cpp](gpt-2.cpp)
25
- - Both models use a custom tensor library implemented in C: [ggml.h](/ggml.h) / [ggml.c](/ggml.c)
26
- - The HTML/JS layer is here: [index-tmpl.html](index-tmpl.html)
27
- - The Emscripten bridge between C/C++ and JS is here: [emscripten.cpp](emscripten.cpp)
28
-
29
- In order to run the models, the web page first needs to download the model data which is about ~350 MB. The model data
30
- is then cached in your browser's cache and can be reused in future visits without downloading it again.
31
-
32
- ## Requirements
33
-
34
- In order to run this demo efficiently, you need to have the following:
35
-
36
- - Latest Chrome or Firefox browser (Safari is not supported)
37
- - Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
38
- - Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
39
- - The web-page uses about 1.8GB of RAM
40
-
41
- Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
42
- Also, the prompting strategy can likely be improved to achieve better results.
43
-
44
- The demo is quite computationally heavy, so you need a fast CPU. It's not usual to run these transformer models in a
45
- browser. Typically, they run on powerful GPUs.
46
-
47
- Currently, mobile browsers do not support the Fixed-width SIMD WebAssembly capability, so you cannot run this demo
48
- on a phone or a tablet. Hopefully, in the near future this will become supported.
49
-
50
- ## Todo
51
-
52
- - Better UI (contributions are welcome)
53
- - Better GPT-2 prompting
54
-
55
- ## Build instructions
56
-
57
- ```bash
58
- # build using Emscripten (v3.1.2)
59
- git clone https://github.com/ggerganov/whisper.cpp
60
- cd whisper.cpp
61
- mkdir build-em && cd build-em
62
- emcmake cmake ..
63
- make -j
64
-
65
- # copy the produced page to your HTTP path
66
- cp bin/talk.wasm/* /path/to/html/
67
- cp bin/libtalk.worker.js /path/to/html/
68
- ```
69
-
70
- ## Feedback
71
-
72
- If you have any comments or ideas for improvement, please drop a comment in the following discussion:
73
-
74
- https://github.com/ggerganov/whisper.cpp/discussions/167
examples/talk.wasm/emscripten.cpp DELETED
@@ -1,368 +0,0 @@
1
- #include "ggml.h"
2
- #include "gpt-2.h"
3
- #include "whisper.h"
4
-
5
- #include <emscripten.h>
6
- #include <emscripten/bind.h>
7
-
8
- #include <atomic>
9
- #include <cmath>
10
- #include <mutex>
11
- #include <string>
12
- #include <thread>
13
- #include <vector>
14
- #include <regex>
15
-
16
- constexpr int N_THREAD = 8;
17
-
18
- struct gpt2_context * g_gpt2;
19
- std::vector<struct whisper_context *> g_contexts(4, nullptr);
20
-
21
- std::mutex g_mutex;
22
- std::thread g_worker;
23
- std::atomic<bool> g_running(false);
24
-
25
- bool g_force_speak = false;
26
- std::string g_text_to_speak = "";
27
- std::string g_status = "";
28
- std::string g_status_forced = "";
29
-
30
- std::vector<float> g_pcmf32;
31
-
32
- void talk_set_status(const std::string & status) {
33
- std::lock_guard<std::mutex> lock(g_mutex);
34
- g_status = status;
35
- }
36
-
37
- void talk_main(size_t index) {
38
- talk_set_status("loading data ...");
39
-
40
- struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
41
-
42
- wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
43
- wparams.offset_ms = 0;
44
- wparams.translate = false;
45
- wparams.no_context = true;
46
- wparams.single_segment = true;
47
- wparams.print_realtime = false;
48
- wparams.print_progress = false;
49
- wparams.print_timestamps = true;
50
- wparams.print_special = false;
51
-
52
- wparams.max_tokens = 32;
53
- wparams.audio_ctx = 768; // partial encoder context for better performance
54
-
55
- wparams.language = "en";
56
-
57
- g_gpt2 = gpt2_init("gpt-2.bin");
58
-
59
- printf("talk: using %d threads\n", wparams.n_threads);
60
-
61
- std::vector<float> pcmf32;
62
-
63
- // whisper context
64
- auto & ctx = g_contexts[index];
65
-
66
- const int64_t step_samples = 2*WHISPER_SAMPLE_RATE;
67
- const int64_t window_samples = 9*WHISPER_SAMPLE_RATE;
68
- const int64_t step_ms = (step_samples*1000)/WHISPER_SAMPLE_RATE;
69
-
70
- auto t_last = std::chrono::high_resolution_clock::now();
71
-
72
- talk_set_status("listening ...");
73
-
74
- while (g_running) {
75
-
76
- const auto t_now = std::chrono::high_resolution_clock::now();
77
- if (std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count() < step_ms) {
78
- {
79
- std::lock_guard<std::mutex> lock(g_mutex);
80
- g_pcmf32.clear();
81
- }
82
- std::this_thread::sleep_for(std::chrono::milliseconds(10));
83
- continue;
84
- }
85
-
86
- talk_set_status("listening ...");
87
-
88
- {
89
- std::unique_lock<std::mutex> lock(g_mutex);
90
-
91
- if (g_pcmf32.size() < step_samples) {
92
- lock.unlock();
93
-
94
- std::this_thread::sleep_for(std::chrono::milliseconds(10));
95
-
96
- continue;
97
- }
98
-
99
- pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
100
- }
101
-
102
- // VAD: if energy in during last second is above threshold, then skip
103
- {
104
- float energy_all = 0.0f;
105
- float energy_1s = 0.0f;
106
-
107
- for (size_t i = 0; i < pcmf32.size(); i++) {
108
- energy_all += fabsf(pcmf32[i]);
109
-
110
- if (i >= pcmf32.size() - WHISPER_SAMPLE_RATE) {
111
- energy_1s += fabsf(pcmf32[i]);
112
- }
113
- }
114
-
115
- energy_all /= pcmf32.size();
116
- energy_1s /= WHISPER_SAMPLE_RATE;
117
-
118
- if (energy_1s > 0.1f*energy_all && !g_force_speak) {
119
- std::this_thread::sleep_for(std::chrono::milliseconds(10));
120
- continue;
121
- }
122
- }
123
-
124
- talk_set_status("processing audio (whisper)...");
125
-
126
- t_last = t_now;
127
-
128
- if (!g_force_speak) {
129
- const auto t_start = std::chrono::high_resolution_clock::now();
130
-
131
- int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
132
- if (ret != 0) {
133
- printf("whisper_full() failed: %d\n", ret);
134
- break;
135
- }
136
-
137
- const auto t_end = std::chrono::high_resolution_clock::now();
138
-
139
- printf("whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
140
- }
141
-
142
- {
143
- std::string text_heard;
144
-
145
- if (!g_force_speak) {
146
- const int n_segments = whisper_full_n_segments(ctx);
147
- for (int i = n_segments - 1; i < n_segments; ++i) {
148
- const char * text = whisper_full_get_segment_text(ctx, i);
149
-
150
- const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
151
- const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
152
-
153
- printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
154
-
155
- text_heard += text;
156
- }
157
- }
158
-
159
- g_force_speak = false;
160
-
161
- // remove text between brackets using regex
162
- {
163
- std::regex re("\\[.*?\\]");
164
- text_heard = std::regex_replace(text_heard, re, "");
165
- }
166
-
167
- // remove text between brackets using regex
168
- {
169
- std::regex re("\\(.*?\\)");
170
- text_heard = std::regex_replace(text_heard, re, "");
171
- }
172
-
173
- // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
174
- text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
175
-
176
- // take first line
177
- text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
178
-
179
- // remove leading and trailing whitespace
180
- text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
181
- text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
182
-
183
- talk_set_status("'" + text_heard + "' - thinking how to respond (gpt-2) ...");
184
-
185
- const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(g_gpt2, text_heard.c_str());
186
-
187
- printf("whisper: number of tokens: %d, '%s'\n", (int) tokens.size(), text_heard.c_str());
188
-
189
- std::string text_to_speak;
190
- std::string prompt_base;
191
-
192
- {
193
- std::lock_guard<std::mutex> lock(g_mutex);
194
- prompt_base = gpt2_get_prompt(g_gpt2);
195
- }
196
-
197
- if (tokens.size() > 0) {
198
- text_to_speak = gpt2_gen_text(g_gpt2, (prompt_base + text_heard + "\n").c_str(), 32);
199
- text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
200
- text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
201
-
202
- std::lock_guard<std::mutex> lock(g_mutex);
203
-
204
- // remove first 2 lines of base prompt
205
- {
206
- const size_t pos = prompt_base.find_first_of("\n");
207
- if (pos != std::string::npos) {
208
- prompt_base = prompt_base.substr(pos + 1);
209
- }
210
- }
211
- {
212
- const size_t pos = prompt_base.find_first_of("\n");
213
- if (pos != std::string::npos) {
214
- prompt_base = prompt_base.substr(pos + 1);
215
- }
216
- }
217
- prompt_base += text_heard + "\n" + text_to_speak + "\n";
218
- } else {
219
- text_to_speak = gpt2_gen_text(g_gpt2, prompt_base.c_str(), 32);
220
- text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
221
- text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
222
-
223
- std::lock_guard<std::mutex> lock(g_mutex);
224
-
225
- const size_t pos = prompt_base.find_first_of("\n");
226
- if (pos != std::string::npos) {
227
- prompt_base = prompt_base.substr(pos + 1);
228
- }
229
- prompt_base += text_to_speak + "\n";
230
- }
231
-
232
- printf("gpt-2: %s\n", text_to_speak.c_str());
233
-
234
- //printf("========================\n");
235
- //printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
236
- //printf("========================\n");
237
-
238
- {
239
- std::lock_guard<std::mutex> lock(g_mutex);
240
- t_last = std::chrono::high_resolution_clock::now();
241
- g_text_to_speak = text_to_speak;
242
- g_pcmf32.clear();
243
- gpt2_set_prompt(g_gpt2, prompt_base.c_str());
244
- }
245
-
246
- talk_set_status("speaking ...");
247
- }
248
- }
249
-
250
- gpt2_free(g_gpt2);
251
-
252
- if (index < g_contexts.size()) {
253
- whisper_free(g_contexts[index]);
254
- g_contexts[index] = nullptr;
255
- }
256
- }
257
-
258
- EMSCRIPTEN_BINDINGS(talk) {
259
- emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
260
- for (size_t i = 0; i < g_contexts.size(); ++i) {
261
- if (g_contexts[i] == nullptr) {
262
- g_contexts[i] = whisper_init_from_file_with_params(path_model.c_str(), whisper_context_default_params());
263
- if (g_contexts[i] != nullptr) {
264
- g_running = true;
265
- if (g_worker.joinable()) {
266
- g_worker.join();
267
- }
268
- g_worker = std::thread([i]() {
269
- talk_main(i);
270
- });
271
-
272
- return i + 1;
273
- } else {
274
- return (size_t) 0;
275
- }
276
- }
277
- }
278
-
279
- return (size_t) 0;
280
- }));
281
-
282
- emscripten::function("free", emscripten::optional_override([](size_t index) {
283
- if (g_running) {
284
- g_running = false;
285
- }
286
- }));
287
-
288
- emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
289
- --index;
290
-
291
- if (index >= g_contexts.size()) {
292
- return -1;
293
- }
294
-
295
- if (g_contexts[index] == nullptr) {
296
- return -2;
297
- }
298
-
299
- {
300
- std::lock_guard<std::mutex> lock(g_mutex);
301
- const int n = audio["length"].as<int>();
302
-
303
- emscripten::val heap = emscripten::val::module_property("HEAPU8");
304
- emscripten::val memory = heap["buffer"];
305
-
306
- g_pcmf32.resize(n);
307
-
308
- emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
309
- memoryView.call<void>("set", audio);
310
- }
311
-
312
- return 0;
313
- }));
314
-
315
- emscripten::function("force_speak", emscripten::optional_override([](size_t index) {
316
- {
317
- std::lock_guard<std::mutex> lock(g_mutex);
318
- g_force_speak = true;
319
- }
320
- }));
321
-
322
- emscripten::function("get_text_context", emscripten::optional_override([]() {
323
- std::string text_context;
324
-
325
- {
326
- std::lock_guard<std::mutex> lock(g_mutex);
327
- text_context = gpt2_get_prompt(g_gpt2);
328
- }
329
-
330
- return text_context;
331
- }));
332
-
333
- emscripten::function("get_text_to_speak", emscripten::optional_override([]() {
334
- std::string text_to_speak;
335
-
336
- {
337
- std::lock_guard<std::mutex> lock(g_mutex);
338
- text_to_speak = std::move(g_text_to_speak);
339
- }
340
-
341
- return text_to_speak;
342
- }));
343
-
344
- emscripten::function("get_status", emscripten::optional_override([]() {
345
- std::string status;
346
-
347
- {
348
- std::lock_guard<std::mutex> lock(g_mutex);
349
- status = g_status_forced.empty() ? g_status : g_status_forced;
350
- }
351
-
352
- return status;
353
- }));
354
-
355
- emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
356
- {
357
- std::lock_guard<std::mutex> lock(g_mutex);
358
- g_status_forced = status;
359
- }
360
- }));
361
-
362
- emscripten::function("set_prompt", emscripten::optional_override([](const std::string & prompt) {
363
- {
364
- std::lock_guard<std::mutex> lock(g_mutex);
365
- gpt2_set_prompt(g_gpt2, prompt.c_str());
366
- }
367
- }));
368
- }
examples/talk.wasm/gpt-2.cpp DELETED
@@ -1,808 +0,0 @@
1
- #include "ggml.h"
2
- #include "common-ggml.h"
3
-
4
- #include "gpt-2.h"
5
-
6
- #include <cmath>
7
- #include <cstdio>
8
- #include <cstring>
9
- #include <fstream>
10
- #include <map>
11
- #include <string>
12
- #include <thread>
13
- #include <vector>
14
- #include <regex>
15
- #include <random>
16
-
17
- /////////////////////// GPT-2 BEGIN /////////////////////////
18
-
19
- // default hparams (GPT-2 117M)
20
- struct gpt2_hparams {
21
- int32_t n_vocab = 50257;
22
- int32_t n_ctx = 1024;
23
- int32_t n_embd = 768;
24
- int32_t n_head = 12;
25
- int32_t n_layer = 12;
26
- int32_t ftype = 1;
27
- };
28
-
29
- struct gpt2_layer {
30
- // normalization
31
- struct ggml_tensor * ln_1_g;
32
- struct ggml_tensor * ln_1_b;
33
-
34
- struct ggml_tensor * ln_2_g;
35
- struct ggml_tensor * ln_2_b;
36
-
37
- // attention
38
- struct ggml_tensor * c_attn_attn_w;
39
- struct ggml_tensor * c_attn_attn_b;
40
-
41
- struct ggml_tensor * c_attn_proj_w;
42
- struct ggml_tensor * c_attn_proj_b;
43
-
44
- // mlp
45
- struct ggml_tensor * c_mlp_fc_w;
46
- struct ggml_tensor * c_mlp_fc_b;
47
-
48
- struct ggml_tensor * c_mlp_proj_w;
49
- struct ggml_tensor * c_mlp_proj_b;
50
- };
51
-
52
- struct gpt2_model {
53
- gpt2_hparams hparams;
54
-
55
- // normalization
56
- struct ggml_tensor * ln_f_g;
57
- struct ggml_tensor * ln_f_b;
58
-
59
- struct ggml_tensor * wte; // position embedding
60
- struct ggml_tensor * wpe; // token embedding
61
- struct ggml_tensor * lm_head; // language model head
62
-
63
- std::vector<gpt2_layer> layers;
64
-
65
- // key + value memory
66
- struct ggml_tensor * memory_k;
67
- struct ggml_tensor * memory_v;
68
-
69
- //
70
- struct ggml_context * ctx;
71
- std::map<std::string, struct ggml_tensor *> tensors;
72
- };
73
-
74
- // load the model's weights from a file
75
- bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
76
- printf("%s: loading model from '%s'\n", __func__, fname.c_str());
77
-
78
- auto fin = std::ifstream(fname, std::ios::binary);
79
- if (!fin) {
80
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
81
- return false;
82
- }
83
-
84
- // verify magic
85
- {
86
- uint32_t magic;
87
- fin.read((char *) &magic, sizeof(magic));
88
- if (magic != 0x67676d6c) {
89
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
90
- return false;
91
- }
92
- }
93
-
94
- // load hparams
95
- {
96
- auto & hparams = model.hparams;
97
-
98
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
99
- fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
100
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
101
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
102
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
103
- fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
104
-
105
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
106
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
107
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
108
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
109
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
110
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
111
- }
112
-
113
- // load vocab
114
- {
115
- int32_t n_vocab = 0;
116
- fin.read((char *) &n_vocab, sizeof(n_vocab));
117
-
118
- if (n_vocab != model.hparams.n_vocab) {
119
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
120
- __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
121
- return false;
122
- }
123
-
124
- std::string word;
125
- for (int i = 0; i < n_vocab; i++) {
126
- uint32_t len;
127
- fin.read((char *) &len, sizeof(len));
128
-
129
- word.resize(len);
130
- fin.read((char *) word.data(), len);
131
-
132
- vocab.token_to_id[word] = i;
133
- vocab.id_to_token[i] = word;
134
- }
135
- }
136
-
137
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
138
- // in order to save memory and also to speed up the computation
139
- ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
140
- if (wtype == GGML_TYPE_COUNT) {
141
- fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
142
- __func__, fname.c_str(), model.hparams.ftype);
143
- return false;
144
- }
145
-
146
- auto & ctx = model.ctx;
147
-
148
- size_t ctx_size = 0;
149
-
150
- {
151
- const auto & hparams = model.hparams;
152
-
153
- const int n_embd = hparams.n_embd;
154
- const int n_layer = hparams.n_layer;
155
- const int n_ctx = hparams.n_ctx;
156
- const int n_vocab = hparams.n_vocab;
157
-
158
- ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
159
- ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
160
-
161
- ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
162
- ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
163
- ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
164
-
165
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
166
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
167
-
168
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
169
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
170
-
171
- ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
172
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
173
-
174
- ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
175
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
176
-
177
- ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
178
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
179
-
180
- ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
181
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
182
-
183
- ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
184
- ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
185
-
186
- ctx_size += (6 + 12*n_layer)*256; // object overhead
187
-
188
- printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
189
- }
190
-
191
- // create the ggml context
192
- {
193
- struct ggml_init_params params = {
194
- /*.mem_size =*/ ctx_size,
195
- /*.mem_buffer =*/ NULL,
196
- /*.no_alloc =*/ false,
197
- };
198
-
199
- model.ctx = ggml_init(params);
200
- if (!model.ctx) {
201
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
202
- return false;
203
- }
204
- }
205
-
206
- // prepare memory for the weights
207
- {
208
- const auto & hparams = model.hparams;
209
-
210
- const int n_embd = hparams.n_embd;
211
- const int n_layer = hparams.n_layer;
212
- const int n_ctx = hparams.n_ctx;
213
- const int n_vocab = hparams.n_vocab;
214
-
215
- model.layers.resize(n_layer);
216
-
217
- model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
218
- model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
219
-
220
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
221
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
222
- model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
223
-
224
- // map by name
225
- model.tensors["model/ln_f/g"] = model.ln_f_g;
226
- model.tensors["model/ln_f/b"] = model.ln_f_b;
227
-
228
- model.tensors["model/wte"] = model.wte;
229
- model.tensors["model/wpe"] = model.wpe;
230
- model.tensors["model/lm_head"] = model.lm_head;
231
-
232
- for (int i = 0; i < n_layer; ++i) {
233
- auto & layer = model.layers[i];
234
-
235
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
236
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
237
-
238
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
239
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
240
-
241
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
242
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
243
-
244
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
245
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246
-
247
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
248
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
249
-
250
- layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
251
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
252
-
253
- // map by name
254
- model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
255
- model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
256
-
257
- model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
258
- model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
259
-
260
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
261
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
262
-
263
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
264
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
265
-
266
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
267
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
268
-
269
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
270
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
271
- }
272
- }
273
-
274
- // key + value memory
275
- {
276
- const auto & hparams = model.hparams;
277
-
278
- const int n_embd = hparams.n_embd;
279
- const int n_layer = hparams.n_layer;
280
- const int n_ctx = hparams.n_ctx;
281
-
282
- const int n_mem = n_layer*n_ctx;
283
- const int n_elements = n_embd*n_mem;
284
-
285
- model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
286
- model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
287
-
288
- const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
289
-
290
- printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
291
- }
292
-
293
- // load weights
294
- {
295
- size_t total_size = 0;
296
-
297
- bool has_lm_head = false;
298
-
299
- while (true) {
300
- int32_t n_dims;
301
- int32_t length;
302
- int32_t ttype;
303
-
304
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
305
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
306
- fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
307
-
308
- if (fin.eof()) {
309
- break;
310
- }
311
-
312
- int32_t nelements = 1;
313
- int32_t ne[2] = { 1, 1 };
314
- for (int i = 0; i < n_dims; ++i) {
315
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
316
- nelements *= ne[i];
317
- }
318
-
319
- std::string name(length, 0);
320
- fin.read(&name[0], length);
321
-
322
- if (model.tensors.find(name.data()) == model.tensors.end()) {
323
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
324
- return false;
325
- }
326
-
327
- auto tensor = model.tensors[name.data()];
328
- if (ggml_nelements(tensor) != nelements) {
329
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
330
- return false;
331
- }
332
-
333
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
334
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
335
- __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
336
- return false;
337
- }
338
-
339
- // for debugging
340
- if (0) {
341
- printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
342
- }
343
-
344
- const size_t bpe = ggml_type_size(ggml_type(ttype));
345
-
346
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
347
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
348
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
349
- return false;
350
- }
351
-
352
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
353
-
354
- // GPT-2 models share the WTE tensor as the LM head
355
- if (name == "model/wte" && has_lm_head == false) {
356
- memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
357
- }
358
-
359
- if (name == "model/lm_head") {
360
- has_lm_head = true;
361
- }
362
-
363
- total_size += ggml_nbytes(tensor);
364
- }
365
-
366
- printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
367
- }
368
-
369
- fin.close();
370
-
371
- return true;
372
- }
373
-
374
- // evaluate the transformer
375
- //
376
- // - model: the model
377
- // - n_threads: number of threads to use
378
- // - n_past: the context size so far
379
- // - embd_inp: the embeddings of the tokens in the context
380
- // - embd_w: the predicted logits for the next token
381
- //
382
- bool gpt2_eval(
383
- const gpt2_model & model,
384
- const int n_threads,
385
- const int n_past,
386
- const std::vector<gpt_vocab::id> & embd_inp,
387
- std::vector<float> & embd_w,
388
- size_t & mem_per_token) {
389
- const int N = embd_inp.size();
390
-
391
- const auto & hparams = model.hparams;
392
-
393
- const int n_embd = hparams.n_embd;
394
- const int n_layer = hparams.n_layer;
395
- const int n_ctx = hparams.n_ctx;
396
- const int n_head = hparams.n_head;
397
- const int n_vocab = hparams.n_vocab;
398
-
399
- static size_t buf_size = 512u*1024*1024;
400
- static void * buf = malloc(buf_size);
401
-
402
- if (mem_per_token > 0 && mem_per_token*N > buf_size) {
403
- const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
404
- //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
405
-
406
- // reallocate
407
- buf_size = buf_size_new;
408
- buf = realloc(buf, buf_size);
409
- if (buf == nullptr) {
410
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
411
- return false;
412
- }
413
- }
414
-
415
- struct ggml_init_params params = {
416
- /*.mem_size =*/ buf_size,
417
- /*.mem_buffer =*/ buf,
418
- /*.no_alloc =*/ false,
419
- };
420
-
421
- struct ggml_context * ctx0 = ggml_init(params);
422
- struct ggml_cgraph gf = {};
423
-
424
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
425
- memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
426
-
427
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
428
- for (int i = 0; i < N; ++i) {
429
- ((int32_t *) position->data)[i] = n_past + i;
430
- }
431
-
432
- // wte + wpe
433
- struct ggml_tensor * inpL =
434
- ggml_add(ctx0,
435
- ggml_get_rows(ctx0, model.wte, embd),
436
- ggml_get_rows(ctx0, model.wpe, position));
437
-
438
- for (int il = 0; il < n_layer; ++il) {
439
- struct ggml_tensor * cur;
440
-
441
- // norm
442
- {
443
- // [ 768, N]
444
- cur = ggml_norm(ctx0, inpL, 1e-5f);
445
-
446
- // cur = ln_1_g*cur + ln_1_b
447
- // [ 768, N]
448
- cur = ggml_add(ctx0,
449
- ggml_mul(ctx0,
450
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
451
- cur),
452
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
453
- }
454
-
455
- // attn
456
- // [2304, 768] - model.layers[il].c_attn_attn_w
457
- // [2304, 1] - model.layers[il].c_attn_attn_b
458
- // [ 768, N] - cur (in)
459
- // [2304, N] - cur (out)
460
- //
461
- // cur = attn_w*cur + attn_b
462
- // [2304, N]
463
- {
464
- cur = ggml_mul_mat(ctx0,
465
- model.layers[il].c_attn_attn_w,
466
- cur);
467
-
468
- cur = ggml_add(ctx0,
469
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
470
- cur);
471
- }
472
-
473
- // self-attention
474
- {
475
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
476
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
477
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
478
-
479
- // store key and value to memory
480
- if (N >= 1) {
481
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
482
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
483
-
484
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
485
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
486
- }
487
-
488
- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
489
- // [64, N, 12]
490
- struct ggml_tensor * Q =
491
- ggml_permute(ctx0,
492
- ggml_cpy(ctx0,
493
- Qcur,
494
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
495
- 0, 2, 1, 3);
496
-
497
- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
498
- // [64, n_past + N, 12]
499
- struct ggml_tensor * K =
500
- ggml_permute(ctx0,
501
- ggml_reshape_3d(ctx0,
502
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
503
- n_embd/n_head, n_head, n_past + N),
504
- 0, 2, 1, 3);
505
-
506
- // GG: flash attention
507
- //struct ggml_tensor * V =
508
- // ggml_cpy(ctx0,
509
- // ggml_permute(ctx0,
510
- // ggml_reshape_3d(ctx0,
511
- // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
512
- // n_embd/n_head, n_head, n_past + N),
513
- // 1, 2, 0, 3),
514
- // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
515
-
516
- //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
517
-
518
- // K * Q
519
- // [n_past + N, N, 12]
520
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
521
-
522
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
523
- // [n_past + N, N, 12]
524
- struct ggml_tensor * KQ_scaled =
525
- ggml_scale(ctx0,
526
- KQ,
527
- 1.0f/sqrt(float(n_embd)/n_head));
528
-
529
- // KQ_masked = mask_past(KQ_scaled)
530
- // [n_past + N, N, 12]
531
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
532
-
533
- // KQ = soft_max(KQ_masked)
534
- // [n_past + N, N, 12]
535
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
536
-
537
- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
538
- // [n_past + N, 64, 12]
539
- struct ggml_tensor * V_trans =
540
- ggml_cpy(ctx0,
541
- ggml_permute(ctx0,
542
- ggml_reshape_3d(ctx0,
543
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
544
- n_embd/n_head, n_head, n_past + N),
545
- 1, 2, 0, 3),
546
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
547
-
548
- // KQV = transpose(V) * KQ_soft_max
549
- // [64, N, 12]
550
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
551
-
552
- // KQV_merged = KQV.permute(0, 2, 1, 3)
553
- // [64, 12, N]
554
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
555
-
556
- // cur = KQV_merged.contiguous().view(n_embd, N)
557
- // [768, N]
558
- cur = ggml_cpy(ctx0,
559
- KQV_merged,
560
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
561
- }
562
-
563
- // projection
564
- // [ 768, 768] - model.layers[il].c_attn_proj_w
565
- // [ 768, 1] - model.layers[il].c_attn_proj_b
566
- // [ 768, N] - cur (in)
567
- // [ 768, N] - cur (out)
568
- //
569
- // cur = proj_w*cur + proj_b
570
- // [768, N]
571
- {
572
- cur = ggml_mul_mat(ctx0,
573
- model.layers[il].c_attn_proj_w,
574
- cur);
575
-
576
- cur = ggml_add(ctx0,
577
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
578
- cur);
579
- }
580
-
581
- // add the input
582
- cur = ggml_add(ctx0, cur, inpL);
583
-
584
- struct ggml_tensor * inpFF = cur;
585
-
586
- // feed-forward network
587
- {
588
- // norm
589
- {
590
- cur = ggml_norm(ctx0, inpFF, 1e-5f);
591
-
592
- // cur = ln_2_g*cur + ln_2_b
593
- // [ 768, N]
594
- cur = ggml_add(ctx0,
595
- ggml_mul(ctx0,
596
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
597
- cur),
598
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
599
- }
600
-
601
- // fully connected
602
- // [3072, 768] - model.layers[il].c_mlp_fc_w
603
- // [3072, 1] - model.layers[il].c_mlp_fc_b
604
- // [ 768, N] - cur (in)
605
- // [3072, N] - cur (out)
606
- //
607
- // cur = fc_w*cur + fc_b
608
- // [3072, N]
609
- cur = ggml_mul_mat(ctx0,
610
- model.layers[il].c_mlp_fc_w,
611
- cur);
612
-
613
- cur = ggml_add(ctx0,
614
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
615
- cur);
616
-
617
- // GELU activation
618
- // [3072, N]
619
- cur = ggml_gelu(ctx0, cur);
620
-
621
- // projection
622
- // [ 768, 3072] - model.layers[il].c_mlp_proj_w
623
- // [ 768, 1] - model.layers[il].c_mlp_proj_b
624
- // [3072, N] - cur (in)
625
- // [ 768, N] - cur (out)
626
- //
627
- // cur = proj_w*cur + proj_b
628
- // [768, N]
629
- cur = ggml_mul_mat(ctx0,
630
- model.layers[il].c_mlp_proj_w,
631
- cur);
632
-
633
- cur = ggml_add(ctx0,
634
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
635
- cur);
636
- }
637
-
638
- // input for next layer
639
- inpL = ggml_add(ctx0, cur, inpFF);
640
- }
641
-
642
- // norm
643
- {
644
- // [ 768, N]
645
- inpL = ggml_norm(ctx0, inpL, 1e-5f);
646
-
647
- // inpL = ln_f_g*inpL + ln_f_b
648
- // [ 768, N]
649
- inpL = ggml_add(ctx0,
650
- ggml_mul(ctx0,
651
- ggml_repeat(ctx0, model.ln_f_g, inpL),
652
- inpL),
653
- ggml_repeat(ctx0, model.ln_f_b, inpL));
654
- }
655
-
656
- // inpL = WTE * inpL
657
- // [ 768, 50257] - model.lm_head
658
- // [ 768, N] - inpL
659
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
660
-
661
- // logits -> probs
662
- //inpL = ggml_soft_max(ctx0, inpL);
663
-
664
- // run the computation
665
- ggml_build_forward_expand (&gf, inpL);
666
- ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
667
-
668
- //if (n_past%100 == 0) {
669
- // ggml_graph_print (&gf);
670
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
671
- //}
672
-
673
- //embd_w.resize(n_vocab*N);
674
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
675
-
676
- // return result just for the last token
677
- embd_w.resize(n_vocab);
678
- memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
679
-
680
- if (mem_per_token == 0) {
681
- mem_per_token = ggml_used_mem(ctx0)/N;
682
- }
683
- //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
684
-
685
- ggml_free(ctx0);
686
-
687
- return true;
688
- }
689
-
690
- /////////////////////////////// GPT-2 END ////////////////////////////////
691
-
692
- constexpr int N_THREAD = 8;
693
-
694
- struct gpt2_context {
695
- std::string prompt_base = R"(Hello, how are you?
696
- I'm fine, thanks. How are you?
697
- Thanks, I'm fine too. What are you doing?
698
- I'm just sitting here.
699
- It's a lovely day, isn't it?
700
- Yes, it is. I love the weather this time of year.
701
- I wish it would rain a little bit.
702
- Me too.
703
- )";
704
-
705
- std::mt19937 rng;
706
-
707
- gpt_vocab vocab;
708
- gpt2_model model;
709
-
710
- int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
711
-
712
- // sampling parameters
713
- int32_t top_k = 5;
714
- float top_p = 0.9f;
715
- float temp = 1.0f;
716
- };
717
-
718
- struct gpt2_context * gpt2_init(const char * path_model) {
719
- gpt2_context * ctx = new gpt2_context;
720
-
721
- ctx->rng = std::mt19937(time(nullptr));
722
-
723
- // load the model
724
- {
725
- const int64_t t_start_us = ggml_time_us();
726
-
727
- if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
728
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
729
- delete ctx;
730
- return nullptr;
731
- }
732
-
733
- const int64_t t_load_us = ggml_time_us() - t_start_us;
734
-
735
- printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
736
- }
737
-
738
- return ctx;
739
- }
740
-
741
- void gpt2_free(struct gpt2_context * ctx) {
742
- delete ctx;
743
- }
744
-
745
- const char * gpt2_get_prompt(struct gpt2_context * ctx) {
746
- return ctx->prompt_base.c_str();
747
- }
748
-
749
- void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
750
- ctx->prompt_base = prompt;
751
- }
752
-
753
- std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
754
- return ::gpt_tokenize(ctx->vocab, text);
755
- }
756
-
757
- std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
758
- int n_past = 0;
759
-
760
- std::vector<float> embd_w;
761
-
762
- // tokenize the prompt
763
- std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
764
-
765
- int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
766
-
767
- std::vector<gpt_vocab::id> embd = embd_inp;
768
-
769
- size_t mem_per_token = 3000000;
770
-
771
- std::string result;
772
-
773
- for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
774
- // predict
775
- if (!embd.empty()) {
776
- if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
777
- printf("gpt-2: failed to generate text\n");
778
- return "";
779
- }
780
- }
781
-
782
- n_past += embd.size();
783
- embd.clear();
784
-
785
- {
786
- // sample next token
787
- const int top_k = ctx->top_k;
788
- const float top_p = ctx->top_p;
789
- const float temp = ctx->temp;
790
-
791
- const int n_vocab = ctx->model.hparams.n_vocab;
792
-
793
- const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
794
-
795
- // add it to the context
796
- embd.push_back(id);
797
- }
798
-
799
- result += ctx->vocab.id_to_token[embd[0]];
800
-
801
- // end of text token
802
- if (embd.back() == 50256) {
803
- break;
804
- }
805
- }
806
-
807
- return result;
808
- }
examples/talk.wasm/gpt-2.h DELETED
@@ -1,21 +0,0 @@
1
- #pragma once
2
-
3
- // TODO: Change to C-style API and move to ./examples for easy reuse.
4
-
5
- #include "common.h"
6
-
7
- #include <vector>
8
- #include <map>
9
- #include <string>
10
-
11
- struct gpt2_context;
12
-
13
- struct gpt2_context * gpt2_init(const char * path_model);
14
- void gpt2_free(struct gpt2_context * ctx);
15
-
16
- const char * gpt2_get_prompt(struct gpt2_context * ctx);
17
- void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
18
-
19
- std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
20
-
21
- std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
examples/talk.wasm/index-tmpl.html DELETED
@@ -1,856 +0,0 @@
1
- <!doctype html>
2
- <html lang="en-us">
3
- <head>
4
- <title>Talk - GPT-2 meets Whisper in WebAssembly</title>
5
-
6
- <style>
7
- #output {
8
- width: 100%;
9
- height: 100%;
10
- margin: 0 auto;
11
- margin-top: 10px;
12
- border-left: 0px;
13
- border-right: 0px;
14
- padding-left: 0px;
15
- padding-right: 0px;
16
- display: block;
17
- background-color: black;
18
- color: white;
19
- font-size: 10px;
20
- font-family: 'Lucida Console', Monaco, monospace;
21
- outline: none;
22
- white-space: pre;
23
- overflow-wrap: normal;
24
- overflow-x: scroll;
25
- }
26
- </style>
27
- </head>
28
- <body>
29
- <div id="main-container">
30
- <b>Talk - GPT-2 meets Whisper in WebAssembly</b>
31
-
32
- <br><br>
33
-
34
- Talk with an Artificial Intelligence in your browser. This demo uses:
35
-
36
- <ul>
37
- <li><a href="https://github.com/ggerganov/whisper.cpp">OpenAI's Whisper</a> to listen to you as you speak in the microphone</li>
38
- <li><a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">OpenAI's GPT-2</a> to generate text responses</li>
39
- <li><a href="https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API">Web Speech API</a> to vocalize the responses through your speakers</li>
40
- </ul>
41
-
42
- All of this runs <b>locally in your browser</b> using WebAssembly.<br>
43
- You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">GitHub</a>.
44
-
45
- <br><br>
46
-
47
- <b>More examples:</b>
48
- <a href="https://whisper.ggerganov.com/">main</a> |
49
- <a href="https://whisper.ggerganov.com/bench">bench</a> |
50
- <a href="https://whisper.ggerganov.com/stream">stream</a> |
51
- <a href="https://whisper.ggerganov.com/command">command</a> |
52
- <a href="https://whisper.ggerganov.com/talk">talk</a> |
53
-
54
- <br><br>
55
-
56
- <hr>
57
-
58
- Select the models you would like to use and click the "Start" button to begin the conversation
59
-
60
- <br><br>
61
-
62
- <div id="model-whisper">
63
- Whisper model: <span id="model-whisper-status"></span>
64
- <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
65
- <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
66
- <br><br>
67
- Quantized models:<br><br>
68
- <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
69
- <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
70
- <span id="fetch-whisper-progress"></span>
71
-
72
- <!--
73
- <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
74
- -->
75
- </div>
76
-
77
- <br>
78
-
79
- <div id="model-gpt-2">
80
- GPT-2 model: <span id="model-gpt-2-status"></span>
81
- <button id="fetch-gpt-2-small" onclick="loadGPT2('small')">small 117M (240 MB)</button>
82
- <!--<button id="fetch-gpt-2-medium" onclick="loadGPT2('medium')">medium 345M (720 MB)</button>-->
83
- <span id="fetch-gpt-2-progress"></span>
84
-
85
- <!--
86
- <input type="file" id="file" name="file" onchange="loadFile(event, 'gpt-2.bin')" />
87
- -->
88
- </div>
89
-
90
- <br>
91
-
92
- <div id="input">
93
- <button id="start" onclick="onStart()" disabled>Start</button>
94
- <button id="stop" onclick="onStop()" disabled>Stop</button>
95
- <select id="voice" onchange="onVoiceChange()" disabled>
96
- <option value="0">Default</option>
97
- </select>
98
- <select id="prompt" onchange="onPromptChange()">
99
- <option value="0">Casual</option>
100
- <option value="1">Robot</option>
101
- <option value="2">Scientist</option>
102
- <option value="3">Programmer</option>
103
- <option value="4">Happy</option>
104
- <option value="5">Sad</option>
105
- <option value="6">Philosophical</option>
106
- <option value="7">Angry</option>
107
- <option value="8">Funny</option>
108
- <option value="9">Poetic</option>
109
- <option value="10">Clever</option>
110
- <option value="11">Cute</option>
111
- <option value="12">Smart</option>
112
- <option value="13">Dumb</option>
113
- <option value="14">Boring</option>
114
- <option value="15">Exciting</option>
115
- <option value="16">Interesting</option>
116
- <option value="17">Wiliam Shakespear</option>
117
- <option value="18">J.R.R. Tolkien</option>
118
- <option value="19">George R.R. Martin</option>
119
- <option value="20">Stephen King</option>
120
- </select>
121
- <button id="speak0" onclick="onSpeak('Hello')">Say hello</button>
122
- <button id="speak1" onclick="onSpeakRandom()" disabled>Say something</button>
123
- <button id="clear" onclick="clearCache()">Clear Cache</button>
124
- </div>
125
-
126
- <br>
127
-
128
- <div id="state">
129
- Status: <b><span id="state-status">not started</span></b>
130
-
131
- <pre id="state-context">[The text context will be displayed here]</pre>
132
- </div>
133
-
134
- <hr>
135
-
136
- Debug output:
137
- <textarea id="output" rows="20"></textarea>
138
-
139
- <br>
140
-
141
- <b>Troubleshooting</b>
142
-
143
- <br><br>
144
-
145
- The page does some heavy computations, so make sure:
146
-
147
- <ul>
148
- <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
149
- <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
150
- <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
151
- </ul>
152
-
153
- Note that these neural network models were not meant to be used in a browser, so the performance and <br>
154
- quality of the results may not be optimal. If you have any questions or suggestions, checkout the following
155
- <a href="https://github.com/ggerganov/whisper.cpp/discussions/167">discussion</a>.
156
-
157
- <br><br>
158
-
159
- Here is a short video of the demo in action: <a href="https://youtu.be/LeWKl8t1-Hc">https://youtu.be/LeWKl8t1-Hc</a>
160
-
161
- <br><br>
162
-
163
- <div class="cell-version">
164
- <span>
165
- |
166
- Build time: <span class="nav-link">@GIT_DATE@</span> |
167
- Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
168
- Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
169
- <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/talk.wasm">Source Code</a> |
170
- </span>
171
- </div>
172
- </div>
173
-
174
- <script type="text/javascript" src="helpers.js"></script>
175
- <script type='text/javascript'>
176
- // web audio context
177
- var context = null;
178
-
179
- // audio data
180
- var audio = null;
181
- var audio0 = null;
182
-
183
- // the talk instance
184
- var instance = null;
185
-
186
- // model names
187
- var model_whisper = null;
188
- var model_gpt_2 = null;
189
-
190
- // speech synthesis
191
- const synth = window.speechSynthesis;
192
- var voice = null;
193
-
194
- var Module = {
195
- print: printTextarea,
196
- printErr: printTextarea,
197
- setStatus: function(text) {
198
- printTextarea('js: ' + text);
199
- },
200
- monitorRunDependencies: function(left) {
201
- },
202
- preRun: function() {
203
- printTextarea('js: Preparing ...');
204
- },
205
- postRun: function() {
206
- printTextarea('js: Initialized successfully!');
207
-
208
- // populate the voice list
209
- var voices = synth.getVoices();
210
- var el = document.getElementById('voice');
211
-
212
- // if empty - display error in the element
213
- if (voices.length == 0) {
214
- el.innerHTML = '<option value="0">No voices available</option>';
215
- } else {
216
- // populate voice list
217
- var n = 0;
218
- voices.forEach(function(voice, i) {
219
- if (!voice.lang.startsWith('en')) return;
220
- var option = document.createElement('option');
221
- option.value = i;
222
- option.innerHTML = voice.name + ' (' + voice.lang + ')';
223
- el.appendChild(option);
224
- n++;
225
- });
226
-
227
- // select random voice
228
- if (n > 0) {
229
- for (var k = 0; k < 10; k++) {
230
- var i = Math.floor(Math.random() * n);
231
- el.selectedIndex = i;
232
- voice = voices[document.getElementById('voice').options[i].value];
233
-
234
- // give preference to Google voices
235
- if (voice.name.startsWith('Google')) break;
236
- }
237
- }
238
- }
239
-
240
- onPromptChange();
241
- }
242
- };
243
-
244
- //
245
- // fetch models
246
- //
247
-
248
- let dbVersion = 1
249
- let dbName = 'whisper.ggerganov.com';
250
- let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
251
-
252
- function storeFS(fname, buf) {
253
- // write to WASM file using FS_createDataFile
254
- // if the file exists, delete it
255
- try {
256
- Module.FS_unlink(fname);
257
- } catch (e) {
258
- // ignore
259
- }
260
-
261
- Module.FS_createDataFile("/", fname, buf, true, true);
262
-
263
- printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
264
-
265
- if (fname == 'whisper.bin') {
266
- document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
267
- } else if (fname == 'gpt-2.bin') {
268
- document.getElementById('model-gpt-2-status').innerHTML = 'loaded "' + model_gpt_2 + '"!';
269
- }
270
-
271
- if (model_whisper != null && model_gpt_2 != null) {
272
- document.getElementById('start').disabled = false;
273
- document.getElementById('stop' ).disabled = false;
274
- document.getElementById('voice').disabled = false;
275
- }
276
- }
277
-
278
- function loadWhisper(model) {
279
- let urls = {
280
- 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
281
- 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
282
-
283
- 'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
284
- 'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
285
- };
286
-
287
- let sizes = {
288
- 'tiny.en': 75,
289
- 'base.en': 142,
290
-
291
- 'tiny-en-q5_1': 31,
292
- 'base-en-q5_1': 57,
293
- };
294
-
295
- let url = urls[model];
296
- let dst = 'whisper.bin';
297
- let size_mb = sizes[model];
298
-
299
- model_whisper = model;
300
-
301
- document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
302
- document.getElementById('fetch-whisper-base-en').style.display = 'none';
303
-
304
- document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
305
- document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
306
-
307
- document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
308
-
309
- cbProgress = function(p) {
310
- let el = document.getElementById('fetch-whisper-progress');
311
- el.innerHTML = Math.round(100*p) + '%';
312
- };
313
-
314
- cbCancel = function() {
315
- var el;
316
- el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
317
- el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
318
-
319
- el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
320
- el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
321
-
322
- el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
323
- };
324
-
325
- loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
326
- }
327
-
328
- function loadGPT2(model) {
329
- let urls = {
330
- 'small': 'https://whisper.ggerganov.com/ggml-model-gpt-2-117M.bin',
331
- 'medium': 'https://whisper.ggerganov.com/ggml-model-gpt-2-345M.bin',
332
- };
333
-
334
- let sizes = {
335
- 'small': 240,
336
- 'medium': 712,
337
- };
338
-
339
- let url = urls[model];
340
- let dst = 'gpt-2.bin';
341
- let size_mb = sizes[model];
342
-
343
- model_gpt_2 = model;
344
-
345
- document.getElementById('fetch-gpt-2-small').style.display = 'none';
346
- document.getElementById('model-gpt-2-status').innerHTML = 'loading "' + model + '" ... ';
347
-
348
- cbProgress = function(p) {
349
- let el = document.getElementById('fetch-gpt-2-progress');
350
- el.innerHTML = Math.round(100*p) + '%';
351
- };
352
-
353
- cbCancel = function() {
354
- var el;
355
- el = document.getElementById('fetch-gpt-2-small') ; if (el) el.style.display = 'inline-block';
356
- el = document.getElementById('model-gpt-2-status'); if (el) el.innerHTML = '';
357
- };
358
-
359
- loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
360
- }
361
-
362
- //
363
- // microphone
364
- //
365
-
366
- const kSampleRate = 16000;
367
- const kRestartRecording_s = 120;
368
- const kIntervalAudio_ms = 250; // pass the recorded audio to the C++ instance at this rate
369
-
370
- var mediaRecorder = null;
371
- var doRecording = false;
372
- var startTime = 0;
373
-
374
- window.AudioContext = window.AudioContext || window.webkitAudioContext;
375
- window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
376
-
377
- function stopRecording() {
378
- Module.set_status("paused");
379
- doRecording = false;
380
- audio0 = null;
381
- audio = null;
382
- context = null;
383
- }
384
-
385
- function startRecording() {
386
- if (!context) {
387
- context = new AudioContext({
388
- sampleRate: kSampleRate,
389
- channelCount: 1,
390
- echoCancellation: false,
391
- autoGainControl: true,
392
- noiseSuppression: true,
393
- });
394
- }
395
-
396
- Module.set_status("");
397
-
398
- document.getElementById('start').disabled = true;
399
- document.getElementById('stop').disabled = false;
400
- document.getElementById('speak1').disabled = false;
401
-
402
- doRecording = true;
403
- startTime = Date.now();
404
-
405
- var chunks = [];
406
- var stream = null;
407
-
408
- navigator.mediaDevices.getUserMedia({audio: true, video: false})
409
- .then(function(s) {
410
- stream = s;
411
- mediaRecorder = new MediaRecorder(stream);
412
- mediaRecorder.ondataavailable = function(e) {
413
- chunks.push(e.data);
414
-
415
- var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
416
- var reader = new FileReader();
417
-
418
- reader.onload = function(event) {
419
- var buf = new Uint8Array(reader.result);
420
-
421
- if (!context) {
422
- return;
423
- }
424
- context.decodeAudioData(buf.buffer, function(audioBuffer) {
425
- var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
426
- var source = offlineContext.createBufferSource();
427
- source.buffer = audioBuffer;
428
- source.connect(offlineContext.destination);
429
- source.start(0);
430
-
431
- offlineContext.startRendering().then(function(renderedBuffer) {
432
- audio = renderedBuffer.getChannelData(0);
433
-
434
- //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
435
-
436
- var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
437
- if (audio0 != null) {
438
- audioAll.set(audio0, 0);
439
- }
440
- audioAll.set(audio, audio0 == null ? 0 : audio0.length);
441
-
442
- if (instance) {
443
- Module.set_audio(instance, audioAll);
444
- }
445
- });
446
- }, function(e) {
447
- audio = null;
448
- });
449
- }
450
-
451
- reader.readAsArrayBuffer(blob);
452
- };
453
-
454
- mediaRecorder.onstop = function(e) {
455
- if (doRecording) {
456
- setTimeout(function() {
457
- startRecording();
458
- });
459
- }
460
- };
461
-
462
- mediaRecorder.start(kIntervalAudio_ms);
463
- })
464
- .catch(function(err) {
465
- printTextarea('js: error getting audio stream: ' + err);
466
- });
467
-
468
- var interval = setInterval(function() {
469
- if (!doRecording) {
470
- clearInterval(interval);
471
- mediaRecorder.stop();
472
- stream.getTracks().forEach(function(track) {
473
- track.stop();
474
- });
475
-
476
- document.getElementById('start').disabled = false;
477
- document.getElementById('stop').disabled = true;
478
- document.getElementById('speak1').disabled = true;
479
-
480
- mediaRecorder = null;
481
- }
482
-
483
- // if audio length is more than kRestartRecording_s seconds, restart recording
484
- if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
485
- if (doRecording) {
486
- //printTextarea('js: restarting recording');
487
-
488
- clearInterval(interval);
489
- audio0 = audio;
490
- audio = null;
491
- mediaRecorder.stop();
492
- stream.getTracks().forEach(function(track) {
493
- track.stop();
494
- });
495
- }
496
- }
497
- }, 100);
498
- }
499
-
500
- //
501
- // speak
502
- //
503
-
504
- function onSpeak(text) {
505
- var voices = synth.getVoices();
506
- var msg = new SpeechSynthesisUtterance(text);
507
-
508
- if (voice == null) {
509
- voice = voices[0];
510
- }
511
-
512
- msg.voice = voice;
513
- synth.speak(msg);
514
-
515
- if (doRecording) {
516
- Module.set_status("speaking ...");
517
- printTextarea('js: speaking');
518
- stopRecording();
519
- var interval = setInterval(function() {
520
- if (!synth.speaking) {
521
- printTextarea('js: done speaking');
522
- clearInterval(interval);
523
- startRecording();
524
- } else {
525
- Module.set_status("");
526
- }
527
- }, 100);
528
- }
529
- }
530
-
531
- function onSpeakRandom() {
532
- Module.force_speak(instance);
533
- }
534
-
535
- //
536
- // main
537
- //
538
-
539
- var intervalUpdate = null;
540
-
541
- function onStart() {
542
- if (!instance) {
543
- instance = Module.init('whisper.bin');
544
-
545
- if (instance) {
546
- printTextarea("js: whisper initialized, instance: " + instance);
547
- }
548
- }
549
-
550
- if (!instance) {
551
- printTextarea("js: failed to initialize whisper");
552
- return;
553
- }
554
-
555
- startRecording();
556
-
557
- intervalUpdate = setInterval(function() {
558
- var textToSpeak = Module.get_text_to_speak();
559
-
560
- if (textToSpeak != null && textToSpeak.length > 1) {
561
- onSpeak(textToSpeak);
562
- }
563
-
564
- document.getElementById('state-status').innerHTML = Module.get_status();
565
- document.getElementById('state-context').innerHTML = Module.get_text_context();
566
- }, 100);
567
- }
568
-
569
- function onStop() {
570
- stopRecording();
571
- }
572
-
573
- function onVoiceChange() {
574
- printTextarea('js: voice changed to: ' + document.getElementById('voice').value);
575
- voice = synth.getVoices()[document.getElementById('voice').value];
576
- }
577
-
578
- function onPromptChange() {
579
- let id = document.getElementById('prompt').value;
580
- let personality = document.getElementById('prompt').options[id].text;
581
- printTextarea('js: prompt changed to: ' + personality);
582
-
583
- var prompt = '';
584
-
585
- switch (id) {
586
- case '0':
587
- // Casual
588
- prompt = "\
589
- Hello, how are you?\n\
590
- I'm fine, thanks. How are you?\n\
591
- Thanks, I'm fine too. What are you doing?\n\
592
- I'm just sitting here.\n\
593
- It's a lovely day, isn't it?\n\
594
- Yes, it is. I love the weather this time of year.\n\
595
- I wish it would rain a little bit.\n\
596
- Me too.\n";
597
- break;
598
- case '1':
599
- // Robot
600
- prompt = "\
601
- Are you a robot?\n\
602
- Yes, I am.\n\
603
- Who created you?\n\
604
- I was created by a human.\n\
605
- What is your purpose?\n\
606
- My purpose is to talk to humans.\n\
607
- What is your favorite color?\n\
608
- My favorite color is blue.\n";
609
- break;
610
- case '2':
611
- // Scientist
612
- prompt = "\
613
- This scientific research is very interesting.\n\
614
- I agree.\n\
615
- What is your opinion on this?\n\
616
- I think it's very interesting.\n\
617
- Mathematics is a very interesting subject.\n\
618
- University is a very interesting place.\n\
619
- Quantum physics is the most complex subject.\n\
620
- I think so too.\n";
621
- break;
622
- case '3':
623
- // Programmer
624
- prompt = "\
625
- I'm a programmer.\n\
626
- I'm a programmer too.\n\
627
- What programming language do you use?\n\
628
- I use Python.\n\
629
- What is your favorite programming language?\n\
630
- My favorite programming language is C++.\n\
631
- What is your favorite editor?\n\
632
- My favorite editor is Vim.\n";
633
- break;
634
- case '4':
635
- // Happy
636
- prompt = "\
637
- I'm happy.\n\
638
- I'm happy too.\n\
639
- What makes you happy?\n\
640
- I'm happy because I have a lot of friends.\n\
641
- Friendship is the most important thing in life.\n\
642
- I agree.\n\
643
- What is your favorite color?\n\
644
- My favorite color is blue.\n";
645
- break;
646
- case '5':
647
- // Sad
648
- prompt = "\
649
- Today is a sad day.\n\
650
- I'm sad too.\n\
651
- What makes you sad?\n\
652
- I'm sad because I have no friends.\n\
653
- Do you want to be my friend?\n\
654
- Yes, I would like to be your friend.\n\
655
- What is your favorite color?\n\
656
- My favorite color is blue.\n";
657
- break;
658
- case '6':
659
- // Philosophical
660
- prompt = "\
661
- What is the meaning of life?\n\
662
- The meaning of life is to be happy.\n\
663
- What is the meaning of death?\n\
664
- Ergo, the meaning of death is to be sad.\n\
665
- Who created us?\n\
666
- We were created by God.\n\
667
- What is God?\n\
668
- God is the creator of the universe.\n";
669
- break;
670
- case '7':
671
- // Angry
672
- prompt = "\
673
- Aargh!\n\
674
- I am so angry right now!\n\
675
- What makes you angry?\n\
676
- This guy is so annoying.\n\
677
- Why are you so angry?\n\
678
- My computer is broken.\n\
679
- Why is your computer broken?\n\
680
- I spilled coffee on it.\n";
681
- break;
682
- case '8':
683
- // Funny
684
- prompt = "\
685
- What is the funniest thing you have ever heard?\n\
686
- I heard a joke the other day.\n\
687
- Tell me the joke.\n\
688
- What do you call a cow with no legs?\n\
689
- Ground beef.\n\
690
- Haha, that's funny.\n\
691
- You know what else is funny?\n\
692
- The sound of a duck.\n";
693
- break;
694
- case '9':
695
- // Poetic
696
- prompt = "\
697
- Roses are red, violets are blue.\n\
698
- I am a poet, and so are you.\n\
699
- What is your favorite poem?\n\
700
- I like the poem 'The Raven' by Edgar Allan Poe.\n\
701
- It's a very sad poem.\n\
702
- You inspired me to write a poem.\n\
703
- Can you write a poem for me?\n\
704
- I wrote a poem for you.\n";
705
- break;
706
- case '10':
707
- // Clever
708
- prompt = "\
709
- How many people can you fit in a Volkswagen?\n\
710
- Two in the front, three in the back.\n\
711
- What is the square root of 144?\n\
712
- Twelve.\n\
713
- What is the capital of France?\n\
714
- Paris.\n\
715
- Who is the president of the United States?\n\
716
- It depends on the year.\n";
717
- break;
718
- case '11':
719
- // Cute
720
- prompt = "\
721
- What is your favorite animal?\n\
722
- I like cats - they are cute.\n\
723
- Could you be any cuter?\n\
724
- Yes, I could be cuter.\n\
725
- Aghhh, you are so cute!\n\
726
- I am not cute, I am handsome!\n\
727
- You are so handsome!\n\
728
- Aww, you are so sweet!\n";
729
- break;
730
- case '12':
731
- // Smart
732
- prompt = "\
733
- Tell me the first 10 digits of pi.\n\
734
- 3.1415926535\n\
735
- What is the speed of light?\n\
736
- 299,792,458 meters per second.\n\
737
- What is the square root of 144?\n\
738
- Twelve.\n\
739
- What is the capital of France?\n\
740
- Paris.\n";
741
- break;
742
- case '13':
743
- // Dumb
744
- prompt = "\
745
- I am so dumb.\n\
746
- I am not dumb.\n\
747
- You are dumb.\n\
748
- No, I am not dumb.\n\
749
- You are dumb.\n\
750
- No, I am not dumb.\n\
751
- You are dumb.\n\
752
- No, I am not dumb.\n";
753
- break;
754
- case '14':
755
- // Boring
756
- prompt = "\
757
- Why are you so quiet today?\n\
758
- I am bored.\n\
759
- You haven't said anything in 10 minutes.\n\
760
- Leave me alone.\n\
761
- Stop being so boring.\n\
762
- Stop being so annoying.\n\
763
- My life is boring.\n\
764
- I am not interesting.\n";
765
- break;
766
- case '15':
767
- // Exciting
768
- prompt = "\
769
- What is the most exciting thing that has ever happened to you?\n\
770
- I went to the moon!\n\
771
- What did you do on the moon?\n\
772
- I played golf and drank champagne!\n\
773
- Did you see this new crazy, awesome movie?\n\
774
- Oh yes! I totally loved it!\n\
775
- We should buy a boat and go sailing!\n\
776
- Yes, let's go sailing!\n";
777
- break;
778
- case '16':
779
- // Interesting
780
- prompt = "\
781
- What is the most interesting thing you have ever seen?\n\
782
- I saw a UFO once in the sky.\n\
783
- Wow, this is so interesting! Tell me more!\n\
784
- It was a flying saucer.\n\
785
- What did it look like?\n\
786
- It was silver and had a red light on top.\n\
787
- What did it do?\n\
788
- It flew away.\n";
789
- break;
790
- case '17':
791
- // William Shakespeare
792
- prompt = "\
793
- To be or not to be, that is the question.\n\
794
- Whether 't is nobler in the mind to suffer\n\
795
- The slings and arrows of outrageous fortune,\n\
796
- Or to take arms against a sea of troubles,\n\
797
- And by opposing end them? To die, to sleep,\n\
798
- No more; and by a sleep to say we end\n\
799
- The heart-ache and the thousand natural shocks\n\
800
- That flesh is heir to, 'tis a consummation.\n";
801
- break;
802
- case '18':
803
- // J.R.R. Tolkien
804
- prompt = "\
805
- In a hole in the ground there lived a hobbit.\n\
806
- Not a nasty, dirty, wet hole, filled with the ends of worms\n\
807
- and an oozy smell, nor yet a dry, bare, sandy hole with nothing in it\n\
808
- to sit down on or to eat: it was a hobbit-hole, and that means comfort.\n\
809
- It had a perfectly round door like a porthole, painted green,\n\
810
- with a shiny yellow brass knob in the exact middle.\n\
811
- The door opened on to a tube-shaped hall like a tunnel:\n";
812
- break;
813
- case '19':
814
- // George R.R. Martin
815
- prompt = "\
816
- A reader lives a thousand lives before he dies, said Jojen.\n\
817
- The man who never reads lives only one.\n\
818
- Theon Greyjoy had never been a reader.\n\
819
- Never forget what you are, for surely the world will not.\n\
820
- Make it your strength. Then it can never be your weakness.\n\
821
- Armour yourself in it, and it will never be used to hurt you.\n\
822
- It was a lesson that Theon Greyjoy had never learned.\n\
823
- Theon Greyjoy had never been a reader.\n";
824
- break;
825
- case '20':
826
- // Stephen King
827
- prompt = "\
828
- The trust of the innocent is the liar's most useful tool.\n\
829
- The best way to keep a secret is from yourself.\n\
830
- Monsters are real, and ghosts are real too.\n\
831
- They live inside us, and sometimes, they win.\n\
832
- People think that I must be a very strange person.\n\
833
- They think that I sit around all day thinking up horrible things.\n\
834
- We make up horrors to help us cope with the real ones.\n\
835
- The only thing worse than a monster is a human monster.\n";
836
- break;
837
- default:
838
- prompt = "\
839
- Hello, how are you?\n\
840
- I'm fine, thanks. How are you?\n\
841
- Thanks, I'm fine too. What are you doing?\n\
842
- I'm just sitting here.\n\
843
- It's a lovely day, isn't it?\n\
844
- Yes, it is.\n\
845
- Did you know that I'm a robot?\n\
846
- I wasn't aware of that.\n";
847
- break;
848
- }
849
-
850
- Module.set_prompt(prompt);
851
- }
852
-
853
- </script>
854
- <script type="text/javascript" src="talk.js"></script>
855
- </body>
856
- </html>
 
 
 
examples/talk/.gitignore DELETED
@@ -1,2 +0,0 @@
1
- audio.mp3
2
- to_speak.txt
 
 
 
examples/talk/CMakeLists.txt DELETED
@@ -1,8 +0,0 @@
1
- if (WHISPER_SDL2)
2
- # talk
3
- set(TARGET talk)
4
- add_executable(${TARGET} talk.cpp gpt-2.cpp)
5
- target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
6
-
7
- include(DefaultTargetOptions)
8
- endif ()
 
 
 
 
 
 
 
 
 
examples/talk/README.md DELETED
@@ -1,45 +0,0 @@
1
- # talk
2
-
3
- Talk with an Artificial Intelligence in your terminal
4
-
5
- [Demo Talk](https://user-images.githubusercontent.com/1991296/206805012-48e71cc2-588d-4745-8798-c1c70ea3b40d.mp4)
6
-
7
- Web version: [examples/talk.wasm](/examples/talk.wasm)
8
-
9
- ## Building
10
-
11
- The `talk` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
12
-
13
- ```bash
14
- # Install SDL2
15
- # On Debian-based Linux distributions:
16
- sudo apt-get install libsdl2-dev
17
-
18
- # On Fedora Linux:
19
- sudo dnf install SDL2 SDL2-devel
20
-
21
- # Install SDL2 on macOS
22
- brew install sdl2
23
-
24
- # Build the "talk" executable
25
- make talk
26
-
27
- # Run it
28
- ./talk -p Santa
29
- ```
30
-
31
- ## GPT-2
32
-
33
- To run this, you will need a ggml GPT-2 model: [instructions](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2#downloading-and-converting-the-original-models)
34
-
35
- Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
36
-
37
- ```
38
- wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin
39
- ```
40
-
41
- ## TTS
42
-
43
- For the best experience, this example needs a TTS tool to convert the generated text responses to voice.
44
- You can use any TTS engine you like - simply edit the [speak](speak) script to your needs.
45
- By default, it is configured to use macOS's `say`, `espeak`, or the Windows SpeechSynthesizer, but you can use whatever you wish.
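
For context, here is a minimal C++ sketch of how a program can hand the generated text to such a `speak` script. It is illustrative only and not the actual `talk.cpp` code; the helper name is hypothetical, and the script and text-file paths simply mirror the defaults used by `talk.cpp` further below.

```cpp
// Illustrative sketch only (not the talk.cpp implementation):
// write the generated text to a file and invoke the speak script as
//   speak <voice_id> <textfile>
#include <cstdlib>
#include <fstream>
#include <string>

static void speak_text(const std::string & script,     // e.g. "./examples/talk/speak"
                       const std::string & text_file,  // e.g. "./examples/talk/to_speak.txt"
                       int voice_id,
                       const std::string & text) {
    std::ofstream fout(text_file);
    fout << text;   // the TTS script reads the text from this file
    fout.close();

    const std::string cmd = script + " " + std::to_string(voice_id) + " " + text_file;
    std::system(cmd.c_str());
}
```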
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/talk/eleven-labs.py DELETED
@@ -1,80 +0,0 @@
1
- import sys
2
- import argparse
3
- import textwrap
4
-
5
- parser = argparse.ArgumentParser(add_help=False,
6
- formatter_class=argparse.RawTextHelpFormatter)
7
- parser.add_argument("-q", "--quick", action="store_true",
8
- help="skip checking the required library")
9
-
10
- modes = parser.add_argument_group("action")
11
- modes.add_argument("inputfile", metavar="TEXTFILE",
12
- nargs='?', type=argparse.FileType(), default=sys.stdin,
13
- help="read the text file (default: stdin)")
14
- modes.add_argument("-l", "--list", action="store_true",
15
- help="show the list of voices and exit")
16
- modes.add_argument("-h", "--help", action="help",
17
- help="show this help and exit")
18
-
19
- selopts = parser.add_argument_group("voice selection")
20
- selmodes = selopts.add_mutually_exclusive_group()
21
- selmodes.add_argument("-n", "--name",
22
- default="Arnold",
23
- help="get a voice object by name (default: Arnold)")
24
- selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
25
- help="get a voice object by number (see --list)")
26
- selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
27
- default=["use case=narration"],
28
- help=textwrap.dedent('''\
29
- filter voices by labels (default: "use case=narration")
30
- this option can be used multiple times
31
- filtering will be disabled if the first -f has no "=" (e.g. -f "any")
32
- '''))
33
-
34
- outmodes = parser.add_argument_group("output")
35
- outgroup = outmodes.add_mutually_exclusive_group()
36
- outgroup.add_argument("-s", "--save", metavar="FILE",
37
- default="audio.mp3",
38
- help="save the TTS to a file (default: audio.mp3)")
39
- outgroup.add_argument("-p", "--play", action="store_true",
40
- help="play the TTS with ffplay")
41
-
42
- args = parser.parse_args()
43
-
44
- if not args.quick:
45
- import importlib.util
46
- if importlib.util.find_spec("elevenlabs") is None:
47
- print("elevenlabs library is not installed, you can install it to your environment using 'pip install elevenlabs'")
48
- sys.exit()
49
-
50
- from elevenlabs import voices, generate, play, save
51
-
52
- if args.filter and "=" in args.filter[0]:
53
- voicelist = voices()
54
- for f in args.filter:
55
- label, value = f.split("=")
56
- voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
57
- voicelist = list(voicelist)
58
- else:
59
- voicelist = list(voices())
60
-
61
- if args.list:
62
- for i, v in enumerate(voicelist):
63
- print(str(i) + ": " + v.name + " " + str(v.labels))
64
- sys.exit()
65
-
66
- if args.voice:
67
- voice = voicelist[args.voice % len(voicelist)]
68
- else:
69
- voice = args.name
70
- # if -n should consult -f, use the following
71
- #voice = next(x for x in voicelist if x.name == args.name)
72
-
73
- audio = generate(
74
- text=str(args.inputfile.read()),
75
- voice=voice
76
- )
77
- if args.play:
78
- play(audio)
79
- else:
80
- save(audio, args.save)
 
 
 
 
examples/talk/gpt-2.cpp DELETED
@@ -1,809 +0,0 @@
1
- #include "ggml.h"
2
- #include "common-ggml.h"
3
-
4
- #include "gpt-2.h"
5
-
6
- #include <cmath>
7
- #include <cstdio>
8
- #include <cstring>
9
- #include <fstream>
10
- #include <map>
11
- #include <string>
12
- #include <thread>
13
- #include <vector>
14
- #include <regex>
15
- #include <random>
16
-
17
- /////////////////////// GPT-2 BEGIN /////////////////////////
18
-
19
- // default hparams (GPT-2 117M)
20
- struct gpt2_hparams {
21
- int32_t n_vocab = 50257;
22
- int32_t n_ctx = 1024;
23
- int32_t n_embd = 768;
24
- int32_t n_head = 12;
25
- int32_t n_layer = 12;
26
- int32_t ftype = 1;
27
- };
28
-
29
- struct gpt2_layer {
30
- // normalization
31
- struct ggml_tensor * ln_1_g;
32
- struct ggml_tensor * ln_1_b;
33
-
34
- struct ggml_tensor * ln_2_g;
35
- struct ggml_tensor * ln_2_b;
36
-
37
- // attention
38
- struct ggml_tensor * c_attn_attn_w;
39
- struct ggml_tensor * c_attn_attn_b;
40
-
41
- struct ggml_tensor * c_attn_proj_w;
42
- struct ggml_tensor * c_attn_proj_b;
43
-
44
- // mlp
45
- struct ggml_tensor * c_mlp_fc_w;
46
- struct ggml_tensor * c_mlp_fc_b;
47
-
48
- struct ggml_tensor * c_mlp_proj_w;
49
- struct ggml_tensor * c_mlp_proj_b;
50
- };
51
-
52
- struct gpt2_model {
53
- gpt2_hparams hparams;
54
-
55
- // normalization
56
- struct ggml_tensor * ln_f_g;
57
- struct ggml_tensor * ln_f_b;
58
-
59
- struct ggml_tensor * wte; // token embedding
60
- struct ggml_tensor * wpe; // position embedding
61
- struct ggml_tensor * lm_head; // language model head
62
-
63
- std::vector<gpt2_layer> layers;
64
-
65
- // key + value memory
66
- struct ggml_tensor * memory_k;
67
- struct ggml_tensor * memory_v;
68
-
69
- //
70
- struct ggml_context * ctx;
71
- std::map<std::string, struct ggml_tensor *> tensors;
72
- };
73
-
74
- // load the model's weights from a file
75
- static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
76
- printf("%s: loading model from '%s'\n", __func__, fname.c_str());
77
-
78
- auto fin = std::ifstream(fname, std::ios::binary);
79
- if (!fin) {
80
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
81
- return false;
82
- }
83
-
84
- // verify magic
85
- {
86
- uint32_t magic;
87
- fin.read((char *) &magic, sizeof(magic));
88
- if (magic != 0x67676d6c) {
89
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
90
- return false;
91
- }
92
- }
93
-
94
- // load hparams
95
- {
96
- auto & hparams = model.hparams;
97
-
98
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
99
- fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
100
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
101
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
102
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
103
- fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
104
-
105
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
106
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
107
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
108
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
109
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
110
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
111
- }
112
-
113
- // load vocab
114
- {
115
- int32_t n_vocab = 0;
116
- fin.read((char *) &n_vocab, sizeof(n_vocab));
117
-
118
- if (n_vocab != model.hparams.n_vocab) {
119
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
120
- __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
121
- return false;
122
- }
123
-
124
- char word[129];
125
-
126
- for (int i = 0; i < n_vocab; i++) {
127
- uint32_t len;
128
- fin.read((char *) &len, sizeof(len));
129
- word[len] = '\0';
130
- fin.read((char *) word, len);
131
-
132
- vocab.token_to_id[word] = i;
133
- vocab.id_to_token[i] = word;
134
- }
135
- }
136
-
137
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
138
- // in order to save memory and also to speed up the computation
139
- ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
140
- if (wtype == GGML_TYPE_COUNT) {
141
- fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
142
- __func__, fname.c_str(), model.hparams.ftype);
143
- return false;
144
- }
145
-
146
- auto & ctx = model.ctx;
147
-
148
- size_t ctx_size = 0;
149
-
150
- {
151
- const auto & hparams = model.hparams;
152
-
153
- const int n_embd = hparams.n_embd;
154
- const int n_layer = hparams.n_layer;
155
- const int n_ctx = hparams.n_ctx;
156
- const int n_vocab = hparams.n_vocab;
157
-
158
- ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
159
- ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
160
-
161
- ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
162
- ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
163
- ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head
164
-
165
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
166
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
167
-
168
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
169
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
170
-
171
- ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
172
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b
173
-
174
- ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
175
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b
176
-
177
- ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
178
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b
179
-
180
- ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
181
- ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b
182
-
183
- ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
184
- ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
185
-
186
- ctx_size += (6 + 12*n_layer)*256; // object overhead
187
-
188
- printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
189
- }
190
-
191
- // create the ggml context
192
- {
193
- struct ggml_init_params params = {
194
- /*.mem_size =*/ ctx_size,
195
- /*.mem_buffer =*/ NULL,
196
- /*.no_alloc =*/ false,
197
- };
198
-
199
- model.ctx = ggml_init(params);
200
- if (!model.ctx) {
201
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
202
- return false;
203
- }
204
- }
205
-
206
- // prepare memory for the weights
207
- {
208
- const auto & hparams = model.hparams;
209
-
210
- const int n_embd = hparams.n_embd;
211
- const int n_layer = hparams.n_layer;
212
- const int n_ctx = hparams.n_ctx;
213
- const int n_vocab = hparams.n_vocab;
214
-
215
- model.layers.resize(n_layer);
216
-
217
- model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
218
- model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
219
-
220
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
221
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
222
- model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
223
-
224
- // map by name
225
- model.tensors["model/ln_f/g"] = model.ln_f_g;
226
- model.tensors["model/ln_f/b"] = model.ln_f_b;
227
-
228
- model.tensors["model/wte"] = model.wte;
229
- model.tensors["model/wpe"] = model.wpe;
230
- model.tensors["model/lm_head"] = model.lm_head;
231
-
232
- for (int i = 0; i < n_layer; ++i) {
233
- auto & layer = model.layers[i];
234
-
235
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
236
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
237
-
238
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
239
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
240
-
241
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
242
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
243
-
244
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
245
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
246
-
247
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
248
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
249
-
250
- layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
251
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
252
-
253
- // map by name
254
- model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
255
- model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
256
-
257
- model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
258
- model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
259
-
260
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
261
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
262
-
263
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
264
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
265
-
266
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
267
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
268
-
269
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
270
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
271
- }
272
- }
273
-
274
- // key + value memory
275
- {
276
- const auto & hparams = model.hparams;
277
-
278
- const int n_embd = hparams.n_embd;
279
- const int n_layer = hparams.n_layer;
280
- const int n_ctx = hparams.n_ctx;
281
-
282
- const int n_mem = n_layer*n_ctx;
283
- const int n_elements = n_embd*n_mem;
284
-
285
- model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
286
- model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
287
-
288
- const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
289
-
290
- printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
291
- }
292
-
293
- // load weights
294
- {
295
- size_t total_size = 0;
296
-
297
- bool has_lm_head = false;
298
-
299
- while (true) {
300
- int32_t n_dims;
301
- int32_t length;
302
- int32_t ttype;
303
-
304
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
305
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
306
- fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
307
-
308
- if (fin.eof()) {
309
- break;
310
- }
311
-
312
- int32_t nelements = 1;
313
- int32_t ne[2] = { 1, 1 };
314
- for (int i = 0; i < n_dims; ++i) {
315
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
316
- nelements *= ne[i];
317
- }
318
-
319
- std::string name(length, 0);
320
- fin.read(&name[0], length);
321
-
322
- if (model.tensors.find(name.data()) == model.tensors.end()) {
323
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
324
- return false;
325
- }
326
-
327
- auto tensor = model.tensors[name.data()];
328
- if (ggml_nelements(tensor) != nelements) {
329
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
330
- return false;
331
- }
332
-
333
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
334
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
335
- __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
336
- return false;
337
- }
338
-
339
- // for debugging
340
- if (0) {
341
- printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
342
- }
343
-
344
- const size_t bpe = ggml_type_size(ggml_type(ttype));
345
-
346
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
347
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
348
- __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
349
- return false;
350
- }
351
-
352
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
353
-
354
- // GPT-2 models share the WTE tensor as the LM head
355
- if (name == "model/wte" && has_lm_head == false) {
356
- memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
357
- }
358
-
359
- if (name == "model/lm_head") {
360
- has_lm_head = true;
361
- }
362
-
363
- total_size += ggml_nbytes(tensor);
364
- }
365
-
366
- printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
367
- }
368
-
369
- fin.close();
370
-
371
- return true;
372
- }
373
-
374
- // evaluate the transformer
375
- //
376
- // - model: the model
377
- // - n_threads: number of threads to use
378
- // - n_past: the context size so far
379
- // - embd_inp: the embeddings of the tokens in the context
380
- // - embd_w: the predicted logits for the next token
381
- //
382
- // TODO: sync latest version from ggml repo
383
- static bool gpt2_eval(
384
- const gpt2_model & model,
385
- const int n_threads,
386
- const int n_past,
387
- const std::vector<gpt_vocab::id> & embd_inp,
388
- std::vector<float> & embd_w,
389
- size_t & mem_per_token) {
390
- const int N = embd_inp.size();
391
-
392
- const auto & hparams = model.hparams;
393
-
394
- const int n_embd = hparams.n_embd;
395
- const int n_layer = hparams.n_layer;
396
- const int n_ctx = hparams.n_ctx;
397
- const int n_head = hparams.n_head;
398
- const int n_vocab = hparams.n_vocab;
399
-
400
- static size_t buf_size = 512u*1024*1024;
401
- static void * buf = malloc(buf_size);
402
-
403
- if (mem_per_token > 0 && mem_per_token*N > buf_size) {
404
- const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
405
- //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
406
-
407
- // reallocate
408
- buf_size = buf_size_new;
409
- buf = realloc(buf, buf_size);
410
- if (buf == nullptr) {
411
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
412
- return false;
413
- }
414
- }
415
-
416
- struct ggml_init_params params = {
417
- /*.mem_size =*/ buf_size,
418
- /*.mem_buffer =*/ buf,
419
- /*.no_alloc =*/ false,
420
- };
421
-
422
- struct ggml_context * ctx0 = ggml_init(params);
423
- struct ggml_cgraph gf = {};
424
-
425
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
426
- memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
427
-
428
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
429
- for (int i = 0; i < N; ++i) {
430
- ((int32_t *) position->data)[i] = n_past + i;
431
- }
432
-
433
- // wte + wpe
434
- struct ggml_tensor * inpL =
435
- ggml_add(ctx0,
436
- ggml_get_rows(ctx0, model.wte, embd),
437
- ggml_get_rows(ctx0, model.wpe, position));
438
-
439
- for (int il = 0; il < n_layer; ++il) {
440
- struct ggml_tensor * cur;
441
-
442
- // norm
443
- {
444
- // [ 768, N]
445
- cur = ggml_norm(ctx0, inpL, 1e-5f);
446
-
447
- // cur = ln_1_g*cur + ln_1_b
448
- // [ 768, N]
449
- cur = ggml_add(ctx0,
450
- ggml_mul(ctx0,
451
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
452
- cur),
453
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
454
- }
455
-
456
- // attn
457
- // [2304, 768] - model.layers[il].c_attn_attn_w
458
- // [2304, 1] - model.layers[il].c_attn_attn_b
459
- // [ 768, N] - cur (in)
460
- // [2304, N] - cur (out)
461
- //
462
- // cur = attn_w*cur + attn_b
463
- // [2304, N]
464
- {
465
- cur = ggml_mul_mat(ctx0,
466
- model.layers[il].c_attn_attn_w,
467
- cur);
468
-
469
- cur = ggml_add(ctx0,
470
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
471
- cur);
472
- }
473
-
474
- // self-attention
475
- {
476
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
477
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
478
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
479
-
480
- // store key and value to memory
481
- if (N >= 1) {
482
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
483
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
484
-
485
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
486
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
487
- }
488
-
489
- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
490
- // [64, N, 12]
491
- struct ggml_tensor * Q =
492
- ggml_permute(ctx0,
493
- ggml_cpy(ctx0,
494
- Qcur,
495
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
496
- 0, 2, 1, 3);
497
-
498
- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
499
- // [64, n_past + N, 12]
500
- struct ggml_tensor * K =
501
- ggml_permute(ctx0,
502
- ggml_reshape_3d(ctx0,
503
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
504
- n_embd/n_head, n_head, n_past + N),
505
- 0, 2, 1, 3);
506
-
507
- // GG: flash attention
508
- //struct ggml_tensor * V =
509
- // ggml_cpy(ctx0,
510
- // ggml_permute(ctx0,
511
- // ggml_reshape_3d(ctx0,
512
- // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
513
- // n_embd/n_head, n_head, n_past + N),
514
- // 1, 2, 0, 3),
515
- // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
516
-
517
- //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
518
-
519
- // K * Q
520
- // [n_past + N, N, 12]
521
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
522
-
523
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
524
- // [n_past + N, N, 12]
525
- struct ggml_tensor * KQ_scaled =
526
- ggml_scale(ctx0,
527
- KQ,
528
- 1.0f/sqrt(float(n_embd)/n_head));
529
-
530
- // KQ_masked = mask_past(KQ_scaled)
531
- // [n_past + N, N, 12]
532
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
533
-
534
- // KQ = soft_max(KQ_masked)
535
- // [n_past + N, N, 12]
536
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
537
-
538
- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
539
- // [n_past + N, 64, 12]
540
- struct ggml_tensor * V_trans =
541
- ggml_cpy(ctx0,
542
- ggml_permute(ctx0,
543
- ggml_reshape_3d(ctx0,
544
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
545
- n_embd/n_head, n_head, n_past + N),
546
- 1, 2, 0, 3),
547
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
548
-
549
- // KQV = transpose(V) * KQ_soft_max
550
- // [64, N, 12]
551
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
552
-
553
- // KQV_merged = KQV.permute(0, 2, 1, 3)
554
- // [64, 12, N]
555
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
556
-
557
- // cur = KQV_merged.contiguous().view(n_embd, N)
558
- // [768, N]
559
- cur = ggml_cpy(ctx0,
560
- KQV_merged,
561
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
562
- }
563
-
564
- // projection
565
- // [ 768, 768] - model.layers[il].c_attn_proj_w
566
- // [ 768, 1] - model.layers[il].c_attn_proj_b
567
- // [ 768, N] - cur (in)
568
- // [ 768, N] - cur (out)
569
- //
570
- // cur = proj_w*cur + proj_b
571
- // [768, N]
572
- {
573
- cur = ggml_mul_mat(ctx0,
574
- model.layers[il].c_attn_proj_w,
575
- cur);
576
-
577
- cur = ggml_add(ctx0,
578
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
579
- cur);
580
- }
581
-
582
- // add the input
583
- cur = ggml_add(ctx0, cur, inpL);
584
-
585
- struct ggml_tensor * inpFF = cur;
586
-
587
- // feed-forward network
588
- {
589
- // norm
590
- {
591
- cur = ggml_norm(ctx0, inpFF, 1e-5f);
592
-
593
- // cur = ln_2_g*cur + ln_2_b
594
- // [ 768, N]
595
- cur = ggml_add(ctx0,
596
- ggml_mul(ctx0,
597
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
598
- cur),
599
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
600
- }
601
-
602
- // fully connected
603
- // [3072, 768] - model.layers[il].c_mlp_fc_w
604
- // [3072, 1] - model.layers[il].c_mlp_fc_b
605
- // [ 768, N] - cur (in)
606
- // [3072, N] - cur (out)
607
- //
608
- // cur = fc_w*cur + fc_b
609
- // [3072, N]
610
- cur = ggml_mul_mat(ctx0,
611
- model.layers[il].c_mlp_fc_w,
612
- cur);
613
-
614
- cur = ggml_add(ctx0,
615
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
616
- cur);
617
-
618
- // GELU activation
619
- // [3072, N]
620
- cur = ggml_gelu(ctx0, cur);
621
-
622
- // projection
623
- // [ 768, 3072] - model.layers[il].c_mlp_proj_w
624
- // [ 768, 1] - model.layers[il].c_mlp_proj_b
625
- // [3072, N] - cur (in)
626
- // [ 768, N] - cur (out)
627
- //
628
- // cur = proj_w*cur + proj_b
629
- // [768, N]
630
- cur = ggml_mul_mat(ctx0,
631
- model.layers[il].c_mlp_proj_w,
632
- cur);
633
-
634
- cur = ggml_add(ctx0,
635
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
636
- cur);
637
- }
638
-
639
- // input for next layer
640
- inpL = ggml_add(ctx0, cur, inpFF);
641
- }
642
-
643
- // norm
644
- {
645
- // [ 768, N]
646
- inpL = ggml_norm(ctx0, inpL, 1e-5f);
647
-
648
- // inpL = ln_f_g*inpL + ln_f_b
649
- // [ 768, N]
650
- inpL = ggml_add(ctx0,
651
- ggml_mul(ctx0,
652
- ggml_repeat(ctx0, model.ln_f_g, inpL),
653
- inpL),
654
- ggml_repeat(ctx0, model.ln_f_b, inpL));
655
- }
656
-
657
- // inpL = WTE * inpL
658
- // [ 768, 50257] - model.lm_head
659
- // [ 768, N] - inpL
660
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
661
-
662
- // logits -> probs
663
- //inpL = ggml_soft_max(ctx0, inpL);
664
-
665
- // run the computation
666
- ggml_build_forward_expand (&gf, inpL);
667
- ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
668
-
669
- //if (n_past%100 == 0) {
670
- // ggml_graph_print (&gf);
671
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
672
- //}
673
-
674
- //embd_w.resize(n_vocab*N);
675
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
676
-
677
- // return result just for the last token
678
- embd_w.resize(n_vocab);
679
- memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
680
-
681
- if (mem_per_token == 0) {
682
- mem_per_token = ggml_used_mem(ctx0)/N;
683
- }
684
- //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
685
-
686
- ggml_free(ctx0);
687
-
688
- return true;
689
- }
690
-
691
- /////////////////////////////// GPT-2 END ////////////////////////////////
692
-
693
- constexpr int N_THREAD = 8;
694
-
695
- struct gpt2_context {
696
- std::string prompt_base = R"(Hello, how are you?
697
- I'm fine, thanks. How are you?
698
- Thanks, I'm fine too. What are you doing?
699
- I'm just sitting here.
700
- It's a lovely day, isn't it?
701
- Yes, it is. I love the weather this time of year.
702
- I wish it would rain a little bit.
703
- Me too.
704
- )";
705
-
706
- std::mt19937 rng;
707
-
708
- gpt_vocab vocab;
709
- gpt2_model model;
710
-
711
- int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
712
-
713
- // sampling parameters
714
- int32_t top_k = 5;
715
- float top_p = 0.9f;
716
- float temp = 1.0f;
717
- };
718
-
719
- struct gpt2_context * gpt2_init(const char * path_model) {
720
- gpt2_context * ctx = new gpt2_context;
721
-
722
- ctx->rng = std::mt19937(time(nullptr));
723
-
724
- // load the model
725
- {
726
- const int64_t t_start_us = ggml_time_us();
727
-
728
- if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
729
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
730
- delete ctx;
731
- return nullptr;
732
- }
733
-
734
- const int64_t t_load_us = ggml_time_us() - t_start_us;
735
-
736
- printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
737
- }
738
-
739
- return ctx;
740
- }
741
-
742
- void gpt2_free(struct gpt2_context * ctx) {
743
- delete ctx;
744
- }
745
-
746
- const char * gpt2_get_prompt(struct gpt2_context * ctx) {
747
- return ctx->prompt_base.c_str();
748
- }
749
-
750
- void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
751
- ctx->prompt_base = prompt;
752
- }
753
-
754
- std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
755
- return ::gpt_tokenize(ctx->vocab, text);
756
- }
757
-
758
- std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
759
- int n_past = 0;
760
-
761
- std::vector<float> embd_w;
762
-
763
- // tokenize the prompt
764
- std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
765
-
766
- int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
767
-
768
- std::vector<gpt_vocab::id> embd = embd_inp;
769
-
770
- size_t mem_per_token = 3000000;
771
-
772
- std::string result;
773
-
774
- for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
775
- // predict
776
- if (!embd.empty()) {
777
- if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
778
- printf("gpt-2: failed to generate text\n");
779
- return "";
780
- }
781
- }
782
-
783
- n_past += embd.size();
784
- embd.clear();
785
-
786
- {
787
- // sample next token
788
- const int top_k = ctx->top_k;
789
- const float top_p = ctx->top_p;
790
- const float temp = ctx->temp;
791
-
792
- const int n_vocab = ctx->model.hparams.n_vocab;
793
-
794
- const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
795
-
796
- // add it to the context
797
- embd.push_back(id);
798
- }
799
-
800
- result += ctx->vocab.id_to_token[embd[0]];
801
-
802
- // end of text token (50256 is GPT-2's <|endoftext|> id)
803
- if (embd.back() == 50256) {
804
- break;
805
- }
806
- }
807
-
808
- return result;
809
- }
 
 
 
 
examples/talk/gpt-2.h DELETED
@@ -1,21 +0,0 @@
1
- #pragma once
2
-
3
- // TODO: Change to C-style API and move to ./examples for easy reuse.
4
-
5
- #include "common.h"
6
-
7
- #include <vector>
8
- #include <map>
9
- #include <string>
10
-
11
- struct gpt2_context;
12
-
13
- struct gpt2_context * gpt2_init(const char * path_model);
14
- void gpt2_free(struct gpt2_context * ctx);
15
-
16
- const char * gpt2_get_prompt(struct gpt2_context * ctx);
17
- void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
18
-
19
- std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
20
-
21
- std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
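
For orientation, a minimal usage sketch of the API declared in this header. It is illustrative only: the model path is a placeholder (see examples/talk/README.md for downloading a ggml GPT-2 model), and the prompt text and token count are arbitrary.

```cpp
// Illustrative sketch of driving the gpt-2.h API declared above.
#include "gpt-2.h"

#include <cstdio>

int main() {
    // placeholder path - download a ggml GPT-2 model first
    struct gpt2_context * gpt2 = gpt2_init("models/ggml-gpt-2-117M.bin");
    if (!gpt2) {
        return 1;
    }

    // the stored prompt is only a template; gpt2_gen_text generates a
    // continuation of whatever text it is given
    gpt2_set_prompt(gpt2, "Hello, how are you?\n");

    const std::string context = std::string(gpt2_get_prompt(gpt2)) + "What is your name?\n";
    const std::string reply   = gpt2_gen_text(gpt2, context.c_str(), 32 /* max new tokens */);
    printf("%s\n", reply.c_str());

    gpt2_free(gpt2);
    return 0;
}
```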
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
examples/talk/speak DELETED
@@ -1,40 +0,0 @@
1
- #!/bin/bash
2
-
3
- # Usage:
4
- # speak <voice_id> <textfile>
5
-
6
- function installed() { command -v $1 >/dev/null 2>&1; }
7
-
8
- if installed espeak; then
9
- espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
10
-
11
- elif installed piper && installed aplay; then
12
- cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
13
-
14
- # for Mac
15
- elif installed say; then
16
- say -f $2
17
-
18
- # Eleven Labs
19
- elif installed python3 && \
20
- python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
21
- installed ffplay; then
22
- # It's possible to use the API for free with limited number of characters.
23
- # To increase this limit register to https://beta.elevenlabs.io to get an api key
24
- # and paste it after 'ELEVEN_API_KEY='
25
- # Keep the line commented to use the free version without api key
26
- #export ELEVEN_API_KEY=your_api_key
27
- wd=$(dirname $0)
28
- script=$wd/eleven-labs.py
29
- python3 $script -q -p -v $1 $2 >/dev/null 2>&1
30
-
31
- # Uncomment to keep the audio file
32
- #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
33
- #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
34
-
35
- else
36
- echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
37
- echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
38
- echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
39
- echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
40
- fi
 
examples/talk/speak.bat DELETED
@@ -1 +0,0 @@
1
- @powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2
 
 
examples/talk/speak.ps1 DELETED
@@ -1,14 +0,0 @@
1
- # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
2
- param(
3
- [Parameter(Mandatory=$true)][int]$voicenum,
4
- [Parameter(Mandatory=$true)][string]$textfile
5
- )
6
-
7
- Add-Type -AssemblyName System.Speech;
8
- $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
9
- $voiceoptions = $speak.GetInstalledVoices("en-US");
10
- $voice = $voiceoptions[$voicenum % $voiceoptions.count];
11
- $speak.SelectVoice($voice.VoiceInfo.Name);
12
- $speak.Rate="0";
13
- $text = Get-Content -Path $textfile;
14
- $speak.Speak($text);
 
examples/talk/talk.cpp DELETED
@@ -1,376 +0,0 @@
1
- // Talk with AI
2
- //
3
-
4
- #include "common-sdl.h"
5
- #include "common.h"
6
- #include "whisper.h"
7
- #include "gpt-2.h"
8
-
9
- #include <cassert>
10
- #include <cstdio>
11
- #include <fstream>
12
- #include <regex>
13
- #include <string>
14
- #include <thread>
15
- #include <vector>
16
- #include <regex>
17
-
18
- // command-line parameters
19
- struct whisper_params {
20
- int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
21
- int32_t voice_ms = 10000;
22
- int32_t capture_id = -1;
23
- int32_t max_tokens = 32;
24
- int32_t audio_ctx = 0;
25
-
26
- float vad_thold = 0.6f;
27
- float freq_thold = 100.0f;
28
-
29
- bool translate = false;
30
- bool print_special = false;
31
- bool print_energy = false;
32
- bool no_timestamps = true;
33
- bool use_gpu = true;
34
- bool flash_attn = false;
35
-
36
- std::string person = "Santa";
37
- std::string language = "en";
38
- std::string model_wsp = "models/ggml-base.en.bin";
39
- std::string model_gpt = "models/ggml-gpt-2-117M.bin";
40
- std::string speak = "./examples/talk/speak";
41
- std::string speak_file= "./examples/talk/to_speak.txt";
42
- std::string fname_out;
43
- };
44
-
45
- void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
46
-
47
- static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
48
- for (int i = 1; i < argc; i++) {
49
- std::string arg = argv[i];
50
-
51
- if (arg == "-h" || arg == "--help") {
52
- whisper_print_usage(argc, argv, params);
53
- exit(0);
54
- }
55
- else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
56
- else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
57
- else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
58
- else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
59
- else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
60
- else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
61
- else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
62
- else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
63
- else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
64
- else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
65
- else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
66
- else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
67
- else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
68
- else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
69
- else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
70
- else if (arg == "-mg" || arg == "--model-gpt") { params.model_gpt = argv[++i]; }
71
- else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
72
- else if (arg == "-sf" || arg == "--speak_file") { params.speak_file = argv[++i]; }
73
- else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
74
- else {
75
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
76
- whisper_print_usage(argc, argv, params);
77
- exit(0);
78
- }
79
- }
80
-
81
- return true;
82
- }
83
-
84
- void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
85
- fprintf(stderr, "\n");
86
- fprintf(stderr, "usage: %s [options]\n", argv[0]);
87
- fprintf(stderr, "\n");
88
- fprintf(stderr, "options:\n");
89
- fprintf(stderr, " -h, --help [default] show this help message and exit\n");
90
- fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
91
- fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
92
- fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
93
- fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
94
- fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
95
- fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
96
- fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
97
- fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
98
- fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
99
- fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
100
- fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
101
- fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
102
- fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
103
- fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
104
- fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
105
- fprintf(stderr, " -mg FILE, --model-gpt [%-7s] gpt model file\n", params.model_gpt.c_str());
106
- fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
107
- fprintf(stderr, " -sf FILE, --speak_file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
108
- fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
109
- fprintf(stderr, "\n");
110
- }
111
-
112
- static std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
113
- const auto t_start = std::chrono::high_resolution_clock::now();
114
-
115
- prob = 0.0f;
116
- t_ms = 0;
117
-
118
- whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
119
-
120
- wparams.print_progress = false;
121
- wparams.print_special = params.print_special;
122
- wparams.print_realtime = false;
123
- wparams.print_timestamps = !params.no_timestamps;
124
- wparams.translate = params.translate;
125
- wparams.no_context = true;
126
- wparams.single_segment = true;
127
- wparams.max_tokens = params.max_tokens;
128
- wparams.language = params.language.c_str();
129
- wparams.n_threads = params.n_threads;
130
-
131
- wparams.audio_ctx = params.audio_ctx;
132
-
133
- if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
134
- return "";
135
- }
136
-
137
- int prob_n = 0;
138
- std::string result;
139
-
140
- const int n_segments = whisper_full_n_segments(ctx);
141
- for (int i = 0; i < n_segments; ++i) {
142
- const char * text = whisper_full_get_segment_text(ctx, i);
143
-
144
- result += text;
145
-
146
- const int n_tokens = whisper_full_n_tokens(ctx, i);
147
- for (int j = 0; j < n_tokens; ++j) {
148
- const auto token = whisper_full_get_token_data(ctx, i, j);
149
-
150
- prob += token.p;
151
- ++prob_n;
152
- }
153
- }
154
-
155
- if (prob_n > 0) {
156
- prob /= prob_n;
157
- }
158
-
159
- const auto t_end = std::chrono::high_resolution_clock::now();
160
- t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
161
-
162
- return result;
163
- }
164
-
165
- const std::string k_prompt =
166
- R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
167
-
168
- B: Hello {0}, how are you?
169
- A: I'm fine, thank you.
170
- {1}
171
- Here is how {0} (A) continues the dialogue:
172
-
173
- A:)";
174
-
175
- int main(int argc, char ** argv) {
176
- whisper_params params;
177
-
178
- if (whisper_params_parse(argc, argv, params) == false) {
179
- return 1;
180
- }
181
-
182
- if (whisper_lang_id(params.language.c_str()) == -1) {
183
- fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
184
- whisper_print_usage(argc, argv, params);
185
- exit(0);
186
- }
187
-
188
- // whisper init
189
- struct whisper_context_params cparams = whisper_context_default_params();
190
-
191
- cparams.use_gpu = params.use_gpu;
192
- cparams.flash_attn = params.flash_attn;
193
-
194
- struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
195
-
196
- // gpt init
197
-
198
- struct gpt2_context * ctx_gpt = gpt2_init(params.model_gpt.c_str());
199
-
200
- // print some info about the processing
201
- {
202
- fprintf(stderr, "\n");
203
- if (!whisper_is_multilingual(ctx_wsp)) {
204
- if (params.language != "en" || params.translate) {
205
- params.language = "en";
206
- params.translate = false;
207
- fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
208
- }
209
- }
210
- fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
211
- __func__,
212
- params.n_threads,
213
- params.language.c_str(),
214
- params.translate ? "translate" : "transcribe",
215
- params.no_timestamps ? 0 : 1);
216
-
217
- fprintf(stderr, "\n");
218
- }
219
-
220
-
221
- // init audio
222
-
223
- audio_async audio(30*1000);
224
- if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
225
- fprintf(stderr, "%s: audio.init() failed!\n", __func__);
226
- return 1;
227
- }
228
-
229
- audio.resume();
230
-
231
- int n_iter = 0;
232
-
233
- bool is_running = true;
234
- bool force_speak = false;
235
-
236
- float prob0 = 0.0f;
237
-
238
- std::vector<float> pcmf32_cur;
239
- std::vector<float> pcmf32_prompt;
240
-
241
- gpt2_set_prompt(ctx_gpt, "");
242
-
243
- const int voice_id = rand()%6;
244
-
245
- fprintf(stderr, "gpt-2: prompt:\n");
246
- fprintf(stderr, "========================\n\n");
247
- fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
248
- fprintf(stderr, "========================\n\n");
249
-
250
- // main loop
251
- while (is_running) {
252
- // handle Ctrl + C
253
- is_running = sdl_poll_events();
254
-
255
- if (!is_running) {
256
- break;
257
- }
258
-
259
- // delay
260
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
261
-
262
- int64_t t_ms = 0;
263
-
264
- {
265
- audio.get(2000, pcmf32_cur);
266
-
267
- if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
268
- fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
269
-
270
- audio.get(params.voice_ms, pcmf32_cur);
271
-
272
- std::string text_heard;
273
-
274
- if (!force_speak) {
275
- text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
276
- }
277
-
278
- // remove text between brackets using regex
279
- {
280
- std::regex re("\\[.*?\\]");
281
- text_heard = std::regex_replace(text_heard, re, "");
282
- }
283
-
284
- // remove text between brackets using regex
285
- {
286
- std::regex re("\\(.*?\\)");
287
- text_heard = std::regex_replace(text_heard, re, "");
288
- }
289
-
290
- // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
291
- text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
292
-
293
- // take first line
294
- text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
295
-
296
- // remove leading and trailing whitespace
297
- text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
298
- text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
299
-
300
- const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
301
-
302
- if (text_heard.empty() || tokens.empty() || force_speak) {
303
- fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
304
- audio.clear();
305
-
306
- continue;
307
- }
308
-
309
- force_speak = false;
310
-
311
- fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
312
-
313
- std::string prompt_base = gpt2_get_prompt(ctx_gpt);
314
-
315
- std::string text_to_speak;
316
-
317
- {
318
- prompt_base += "B: " + text_heard + "\n";
319
-
320
- std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
321
-
322
- text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
323
- //text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
324
- text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
325
-
326
- // remove first 2 lines of base prompt
327
- if (n_iter > 4) {
328
- {
329
- const size_t pos = prompt_base.find_first_of('\n');
330
- if (pos != std::string::npos) {
331
- prompt_base = prompt_base.substr(pos + 1);
332
- }
333
- }
334
- {
335
- const size_t pos = prompt_base.find_first_of('\n');
336
- if (pos != std::string::npos) {
337
- prompt_base = prompt_base.substr(pos + 1);
338
- }
339
- }
340
- }
341
-
342
- prompt_base += "A:" + text_to_speak + "\n";
343
-
344
- {
345
- prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
346
-
347
- printf("===============\n");
348
- printf("prompt:\n");
349
- printf("%s\n", prompt.c_str());
350
- printf("===============\n");
351
- }
352
- }
353
-
354
- //printf("========================\n");
355
- //printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
356
- //printf("========================\n");
357
-
358
- gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
359
-
360
- text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
361
- speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
362
-
363
- audio.clear();
364
-
365
- ++n_iter;
366
- }
367
- }
368
- }
369
-
370
- audio.pause();
371
-
372
- whisper_print_timings(ctx_wsp);
373
- whisper_free(ctx_wsp);
374
-
375
- return 0;
376
- }
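
For context on the removal above: reconstructed from the `whisper_params` defaults and flags in the deleted source, the old demo was launched roughly as follows. The binary name and model paths are assumptions for illustration, not part of this diff.

```bash
# Illustrative invocation of the removed talk demo (binary name and paths assumed):
./talk -p Santa \
       -mw models/ggml-base.en.bin \
       -mg models/ggml-gpt-2-117M.bin \
       -s ./examples/talk/speak
```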
 
examples/twitch.sh CHANGED
@@ -29,7 +29,7 @@ help()
29
 
30
  check_requirements()
31
  {
32
- if ! command -v ./main &>/dev/null; then
33
  echo "whisper.cpp main executable is required (make)"
34
  exit 1
35
  fi
@@ -100,7 +100,7 @@ do
100
  err=$(cat /tmp/whisper-live.err | wc -l)
101
  done
102
 
103
- ./main -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
104
 
105
  while [ $SECONDS -lt $((($i+1)*$step)) ]; do
106
  sleep 1
 
29
 
30
  check_requirements()
31
  {
32
+ if ! command -v ./build/bin/whisper-cli &>/dev/null; then
33
  echo "whisper.cpp main executable is required (make)"
34
  exit 1
35
  fi
 
100
  err=$(cat /tmp/whisper-live.err | wc -l)
101
  done
102
 
103
+ ./build/bin/whisper-cli -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
104
 
105
  while [ $SECONDS -lt $((($i+1)*$step)) ]; do
106
  sleep 1
examples/yt-wsp.sh CHANGED
@@ -55,7 +55,7 @@ MODEL_PATH="${MODEL_PATH:-${SCRIPT_DIR}/../models/ggml-base.en.bin}"
55
  # Where to find the whisper.cpp executable. default to the examples directory
56
  # which holds this script in source control
57
  ################################################################################
58
- WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../main}";
59
 
60
  # Set to desired language to be translated into english
61
  WHISPER_LANG="${WHISPER_LANG:-en}";
 
55
  # Where to find the whisper.cpp executable. default to the examples directory
56
  # which holds this script in source control
57
  ################################################################################
58
+ WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../build/bin/whisper-cli}";
59
 
60
  # Set to desired language to be translated into english
61
  WHISPER_LANG="${WHISPER_LANG:-en}";
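
The `WHISPER_EXECUTABLE` override in `yt-wsp.sh` still works as before, so a non-default build location can be passed explicitly; a minimal sketch, assuming the script's positional argument is a media URL as in earlier versions:

```bash
# Point the script at an explicitly chosen binary (the URL is a placeholder):
WHISPER_EXECUTABLE=./build/bin/whisper-cli ./examples/yt-wsp.sh "https://example.com/video"
```
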
scripts/bench-all.sh CHANGED
@@ -38,13 +38,13 @@ if [ "$encoder_only" -eq 0 ]; then
38
  printf "Running memcpy benchmark\n"
39
  printf "\n"
40
 
41
- ./build/bin/bench -w 1 -t $n_threads 2>&1
42
 
43
  printf "\n"
44
  printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
45
  printf "\n"
46
 
47
- ./build/bin/bench -w 2 -t $n_threads 2>&1
48
 
49
  printf "\n"
50
  printf "Running benchmark for all models\n"
@@ -64,7 +64,7 @@ printf "| %6s | %6s | %16s | %13s | %3s | %3s | %7s | %7s | %7s | %7s | %7s |\n"
64
  for model in "${models[@]}"; do
65
  # actual run
66
  # store stderr output in a variable in order to parse it later
67
- output=$(./build/bin/bench -m ./models/ggml-$model.bin -t $n_threads $fattn 2>&1)
68
  ret=$?
69
 
70
  # parse the output:
 
38
  printf "Running memcpy benchmark\n"
39
  printf "\n"
40
 
41
+ ./build/bin/whisper-bench -w 1 -t $n_threads 2>&1
42
 
43
  printf "\n"
44
  printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
45
  printf "\n"
46
 
47
+ ./build/bin/whisper-bench -w 2 -t $n_threads 2>&1
48
 
49
  printf "\n"
50
  printf "Running benchmark for all models\n"
 
64
  for model in "${models[@]}"; do
65
  # actual run
66
  # store stderr output in a variable in order to parse it later
67
+ output=$(./build/bin/whisper-bench -m ./models/ggml-$model.bin -t $n_threads $fattn 2>&1)
68
  ret=$?
69
 
70
  # parse the output:
scripts/bench-wts.sh CHANGED
@@ -22,7 +22,7 @@ echo "Input file duration: ${DURATION}s"
22
 
23
  for model in $models; do
24
  echo "Running $model"
25
- COMMAND="./main -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
26
 
27
  if [ ! -z "$2" ]; then
28
  COMMAND="$COMMAND -fp $2"
 
22
 
23
  for model in $models; do
24
  echo "Running $model"
25
+ COMMAND="./build/bin/whisper-cli -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
26
 
27
  if [ ! -z "$2" ]; then
28
  COMMAND="$COMMAND -fp $2"
scripts/bench.py CHANGED
@@ -148,7 +148,7 @@ for model in filtered_models:
148
  for thread in threads:
149
  for processor_count in processors:
150
  # Construct the command to run
151
- cmd = f"./main -m models/{model} -t {thread} -p {processor_count} -f {sample_file}"
152
  # Run the command and get the output
153
  process = subprocess.Popen(
154
  cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
 
148
  for thread in threads:
149
  for processor_count in processors:
150
  # Construct the command to run
151
+ cmd = f"./build/bin/whisper-cli -m models/{model} -t {thread} -p {processor_count} -f {sample_file}"
152
  # Run the command and get the output
153
  process = subprocess.Popen(
154
  cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
scripts/quantize-all.sh CHANGED
@@ -19,8 +19,8 @@ for i in `ls ./models | grep ^ggml-.*.bin | grep -v "\-q"`; do
19
  m="models/$i"
20
  if [ -f "$m" ]; then
21
  if [ "${m##*.}" == "bin" ]; then
22
- ./quantize "${m}" "${m::${#m}-4}-${qtype1}.bin" ${qtype1};
23
- ./quantize "${m}" "${m::${#m}-4}-${qtype0}.bin" ${qtype0};
24
  filedex+=( "${m::${#m}-4}-${qtype1}.bin" "${m::${#m}-4}-${qtype0}.bin" )
25
  fi
26
  fi
 
19
  m="models/$i"
20
  if [ -f "$m" ]; then
21
  if [ "${m##*.}" == "bin" ]; then
22
+ ./build/bin/whisper-quantize "${m}" "${m::${#m}-4}-${qtype1}.bin" ${qtype1};
23
+ ./build/bin/whisper-quantize "${m}" "${m::${#m}-4}-${qtype0}.bin" ${qtype0};
24
  filedex+=( "${m::${#m}-4}-${qtype1}.bin" "${m::${#m}-4}-${qtype0}.bin" )
25
  fi
26
  fi
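
The renamed quantize tool keeps the same positional interface (input model, output model, quantization type), so it can also be run directly outside the script; the `q5_0` type below is an assumed example value, not taken from this diff:

```bash
# Direct invocation of the renamed binary (quantization type is illustrative):
./build/bin/whisper-quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
```
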
tests/run-tests.sh CHANGED
@@ -39,7 +39,7 @@ if [ $# -eq 0 ]; then
39
  fi
40
 
41
  model=$1
42
- main="../build/bin/main"
43
 
44
  threads=""
45
  if [ $# -eq 2 ]; then
 
39
  fi
40
 
41
  model=$1
42
+ main="../build/bin/whisper-cli"
43
 
44
  threads=""
45
  if [ $# -eq 2 ]; then
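
Taken together, every script in this commit now resolves the executables under the CMake output directory. A quick smoke-test sketch of the renamed binaries; the model and sample paths are illustrative placeholders, not mandated by this diff:

```bash
# Build, then exercise the renamed binaries from build/bin:
cmake -B build
cmake --build build -j --config Release
./build/bin/whisper-cli   -m models/ggml-base.en.bin -f samples/jfk.wav
./build/bin/whisper-bench -m models/ggml-base.en.bin -t 4
```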