danbev committed · Commit dc59673 · unverified · 1 Parent(s): ad4ee45

examples : add VAD speech segments example (#3147)

This commit adds an example that demonstrates how to use a VAD (Voice
Activity Detection) model to segment an audio file into speech segments.

Resolves: https://github.com/ggml-org/whisper.cpp/issues/3144
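
For orientation before the diffs: the new example drives the whisper.cpp VAD API roughly as follows. This is a condensed sketch of the speech.cpp added below, with argument parsing and error handling stripped out; the model and audio paths are the test paths used in the example's README.

```cpp
#include "common-whisper.h" // read_audio_data()
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    // Decode the input file to 16 kHz mono float PCM.
    std::vector<float> pcmf32;
    std::vector<std::vector<float>> pcmf32s;
    read_audio_data("samples/jfk.wav", pcmf32, pcmf32s, false);

    // Load the Silero VAD model (GGML format) into a VAD context.
    whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
        "models/for-tests-silero-v5.1.2-ggml.bin",
        whisper_vad_default_context_params());

    // Run the model over the samples, then convert the stored speech
    // probabilities into start/end segments.
    whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size());
    whisper_vad_segments * segments =
        whisper_vad_segments_from_probs(vctx, whisper_vad_default_params());

    for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
        printf("segment %d: %.2f - %.2f\n", i,
               whisper_vad_segments_get_segment_t0(segments, i),
               whisper_vad_segments_get_segment_t1(segments, i));
    }

    whisper_vad_free_segments(segments);
    whisper_vad_free(vctx);
    return 0;
}
```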

examples/CMakeLists.txt CHANGED
@@ -105,6 +105,7 @@ else()
     add_subdirectory(bench)
     add_subdirectory(server)
     add_subdirectory(quantize)
+    add_subdirectory(vad-speech-segments)
     if (WHISPER_SDL2)
         add_subdirectory(stream)
         add_subdirectory(command)
examples/vad-speech-segments/CMakeLists.txt ADDED
@@ -0,0 +1,8 @@
+set(TARGET vad-speech-segments)
+add_executable(${TARGET} speech.cpp)
+
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+
+install(TARGETS ${TARGET} RUNTIME)
examples/vad-speech-segments/README.md ADDED
@@ -0,0 +1,52 @@
+# whisper.cpp/examples/vad-speech-segments
+
+This example demonstrates how to use a VAD (Voice Activity Detection) model to
+segment an audio file into speech segments.
+
+### Building the example
+The example can be built using the following command:
+```console
+cmake -S . -B build
+cmake --build build -j8 --target vad-speech-segments
+```
+
+### Running the example
+The example can be run using the following command, which uses a model
+that we use internally for testing:
+```console
+./build/bin/vad-speech-segments \
+    --vad-model models/for-tests-silero-v5.1.2-ggml.bin \
+    --file samples/jfk.wav \
+    --no-prints
+
+Detected 5 speech segments:
+Speech segment 0: start = 0.29, end = 2.21
+Speech segment 1: start = 3.30, end = 3.77
+Speech segment 2: start = 4.00, end = 4.35
+Speech segment 3: start = 5.38, end = 7.65
+Speech segment 4: start = 8.16, end = 10.59
+```
+To see more output from whisper.cpp, remove the `--no-prints` argument.
+
+
+### Command line options
+```console
+./build/bin/vad-speech-segments --help
+
+usage: ./build/bin/vad-speech-segments [options] file
+supported audio formats: flac, mp3, ogg, wav
+
+options:
+  -h,        --help                          [default] show this help message and exit
+  -f FNAME,  --file FNAME                    [       ] input audio file path
+  -t N,      --threads N                     [4      ] number of threads to use during computation
+  -ug,       --use-gpu                       [false  ] use GPU
+  -vm FNAME, --vad-model FNAME               [       ] VAD model path
+  -vt N,     --vad-threshold N               [0.50   ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms N  [250    ] VAD min speech duration (drop shorter segments)
+  -vsd N,    --vad-min-silence-duration-ms N [100    ] VAD min silence duration (to split segments)
+  -vmsd N,   --vad-max-speech-duration-s N   [FLT_MAX] VAD max speech duration (auto-split longer)
+  -vp N,     --vad-speech-pad-ms N           [30     ] VAD speech padding (extend segments)
+  -vo N,     --vad-samples-overlap N         [0.10   ] VAD samples overlap (seconds between segments)
+  -np,       --no-prints                     [false  ] do not print anything other than the results
+```
examples/vad-speech-segments/speech.cpp ADDED
@@ -0,0 +1,143 @@
+#include "common.h"
+#include "common-whisper.h"
+
+#include "whisper.h"
+
+#include <cstdio>
+#include <cfloat>
+#include <string>
+
+// command-line parameters
+struct cli_params {
+    int32_t     n_threads                   = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    std::string vad_model                   = "";
+    float       vad_threshold               = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
+    bool        use_gpu                     = false;
+    std::string fname_inp                   = {};
+    bool        no_prints                   = false;
+};
+
+static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, "usage: %s [options] file\n", argv[0]);
+    fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h,        --help                          [default] show this help message and exit\n");
+    fprintf(stderr, "  -f FNAME,  --file FNAME                    [%-7s] input audio file path\n", "");
+    fprintf(stderr, "  -t N,      --threads N                     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(stderr, "  -ug,       --use-gpu                       [%-7s] use GPU\n", params.use_gpu ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n", params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N  [%-7d] VAD min speech duration (drop shorter segments)\n", params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N   [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                           std::string("FLT_MAX").c_str() :
+                                                                           std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N           [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap N         [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
+    fprintf(stderr, "  -np,       --no-prints                     [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
+    fprintf(stderr, "\n");
+}
+
+static char * requires_value_error(const std::string & arg) {
+    fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
+    exit(0);
+}
+
+static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            vad_print_usage(argc, argv, params);
+            exit(0);
+        }
+#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
+        else if (arg == "-f"    || arg == "--file")                        { params.fname_inp                   = ARGV_NEXT; }
+        else if (arg == "-t"    || arg == "--threads")                     { params.n_threads                   = std::stoi(ARGV_NEXT); }
+        else if (arg == "-ug"   || arg == "--use-gpu")                     { params.use_gpu                     = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = ARGV_NEXT; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(ARGV_NEXT); }
+        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(ARGV_NEXT); }
+        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(ARGV_NEXT); }
+        else if (arg == "-np"   || arg == "--no-prints")                   { params.no_prints                   = true; }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            vad_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
+
+int main(int argc, char ** argv) {
+    cli_params cli_params;
+
+    if (!vad_params_parse(argc, argv, cli_params)) {
+        vad_print_usage(argc, argv, cli_params);
+        return 1;
+    }
+
+    if (cli_params.no_prints) {
+        whisper_log_set(cb_log_disable, NULL);
+    }
+
+    // Load the input sample audio file.
+    std::vector<float> pcmf32;
+    std::vector<std::vector<float>> pcmf32s;
+    if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) {
+        fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str());
+        return 2;
+    }
+
+    // Initialize the context which loads the VAD model.
+    struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+    ctx_params.n_threads = cli_params.n_threads;
+    ctx_params.use_gpu   = cli_params.use_gpu;
+    struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
+            cli_params.vad_model.c_str(),
+            ctx_params);
+
+    // Detect speech in the input audio file.
+    if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
+        fprintf(stderr, "error: failed to detect speech\n");
+        return 3;
+    }
+
+    // Get the VAD segments using the probabilities that have been computed
+    // previously and stored in the whisper_vad_context.
+    struct whisper_vad_params params = whisper_vad_default_params();
+    params.threshold               = cli_params.vad_threshold;
+    params.min_speech_duration_ms  = cli_params.vad_min_speech_duration_ms;
+    params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms;
+    params.max_speech_duration_s   = cli_params.vad_max_speech_duration_s;
+    params.speech_pad_ms           = cli_params.vad_speech_pad_ms;
+    params.samples_overlap         = cli_params.vad_samples_overlap;
+    struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params);
+
+    printf("\n");
+    printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments));
+    for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
+        printf("Speech segment %d: start = %.2f, end = %.2f\n", i,
+                whisper_vad_segments_get_segment_t0(segments, i),
+                whisper_vad_segments_get_segment_t1(segments, i));
+    }
+    printf("\n");
+
+    whisper_vad_free_segments(segments);
+    whisper_vad_free(vctx);
+
+    return 0;
+}
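
Not part of the commit, but as a usage note: the start/end values returned by `whisper_vad_segments_get_segment_t0()`/`_t1()` (and printed above) are expressed in seconds, so a detected segment can be mapped back onto the decoded samples directly. Below is a rough sketch, assuming the 16 kHz mono output produced by `read_audio_data`; the helper name `extract_segment` is illustrative, not part of the whisper.cpp API.

```cpp
#include "whisper.h" // WHISPER_SAMPLE_RATE (16000)

#include <algorithm>
#include <cstddef>
#include <vector>

// Copy the samples belonging to one detected speech segment out of the
// full pcmf32 buffer. t0/t1 are the segment start/end times in seconds,
// as returned by whisper_vad_segments_get_segment_t0()/_t1().
static std::vector<float> extract_segment(const std::vector<float> & pcmf32, float t0, float t1) {
    const size_t i0 = std::min(pcmf32.size(), (size_t) (t0 * WHISPER_SAMPLE_RATE));
    const size_t i1 = std::min(pcmf32.size(), (size_t) (t1 * WHISPER_SAMPLE_RATE));
    return std::vector<float>(pcmf32.begin() + i0, pcmf32.begin() + i1);
}
```

Each slice obtained this way can then be processed on its own, independently of the rest of the recording.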