Spaces:
Running
Running
examples : add VAD speech segments example (#3147)
This commit adds an example that demonstrates how to use a VAD (Voice
Activity Detection) model to segment an audio file into speech segments.
Resolves: https://github.com/ggml-org/whisper.cpp/issues/3144
examples/CMakeLists.txt
CHANGED
|
@@ -105,6 +105,7 @@ else()
|
|
| 105 |
add_subdirectory(bench)
|
| 106 |
add_subdirectory(server)
|
| 107 |
add_subdirectory(quantize)
|
|
|
|
| 108 |
if (WHISPER_SDL2)
|
| 109 |
add_subdirectory(stream)
|
| 110 |
add_subdirectory(command)
|
|
|
|
| 105 |
add_subdirectory(bench)
|
| 106 |
add_subdirectory(server)
|
| 107 |
add_subdirectory(quantize)
|
| 108 |
+
add_subdirectory(vad-speech-segments)
|
| 109 |
if (WHISPER_SDL2)
|
| 110 |
add_subdirectory(stream)
|
| 111 |
add_subdirectory(command)
|
examples/vad-speech-segments/CMakeLists.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Build the vad-speech-segments example: segments an audio file into
# speech segments using a VAD (Voice Activity Detection) model.
set(TARGET vad-speech-segments)
add_executable(${TARGET} speech.cpp)

# Shared compile options used by all whisper.cpp example targets.
include(DefaultTargetOptions)

# Links the examples' common helpers (audio loading) and the whisper library.
# FFMPEG_LIBRARIES is empty unless the build enables FFmpeg-based decoding.
target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

install(TARGETS ${TARGET} RUNTIME)
|
examples/vad-speech-segments/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# whisper.cpp/examples/vad-speech-segments
|
| 2 |
+
|
| 3 |
+
This example demonstrates how to use a VAD (Voice Activity Detection) model to
|
| 4 |
+
segment an audio file into speech segments.
|
| 5 |
+
|
| 6 |
+
### Building the example
|
| 7 |
+
The example can be built using the following command:
|
| 8 |
+
```console
|
| 9 |
+
cmake -S . -B build
|
| 10 |
+
cmake --build build -j8 --target vad-speech-segments
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
### Running the example
|
| 14 |
+
The example can be run using the following command, which uses a model
|
| 15 |
+
that we use internally for testing:
|
| 16 |
+
```console
|
| 17 |
+
./build/bin/vad-speech-segments \
|
| 18 |
+
-vad-model models/for-tests-silero-v5.1.2-ggml.bin \
|
| 19 |
+
--file samples/jfk.wav \
|
| 20 |
+
--no-prints
|
| 21 |
+
|
| 22 |
+
Detected 5 speech segments:
|
| 23 |
+
Speech segment 0: start = 0.29, end = 2.21
|
| 24 |
+
Speech segment 1: start = 3.30, end = 3.77
|
| 25 |
+
Speech segment 2: start = 4.00, end = 4.35
|
| 26 |
+
Speech segment 3: start = 5.38, end = 7.65
|
| 27 |
+
Speech segment 4: start = 8.16, end = 10.59
|
| 28 |
+
```
|
| 29 |
+
To see more output from whisper.cpp remove the `--no-prints` argument.
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
### Command line options
|
| 33 |
+
```console
|
| 34 |
+
./build/bin/vad-speech-segments --help
|
| 35 |
+
|
| 36 |
+
usage: ./build/bin/vad-speech-segments [options] file
|
| 37 |
+
supported audio formats: flac, mp3, ogg, wav
|
| 38 |
+
|
| 39 |
+
options:
|
| 40 |
+
-h, --help [default] show this help message and exit
|
| 41 |
+
-f FNAME, --file FNAME [ ] input audio file path
|
| 42 |
+
-t N, --threads N [4 ] number of threads to use during computation
|
| 43 |
+
-ug, --use-gpu [true ] use GPU
|
| 44 |
+
-vm FNAME, --vad-model FNAME [ ] VAD model path
|
| 45 |
+
-vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
|
| 46 |
+
-vspd N,   --vad-min-speech-duration-ms  N  [250    ] VAD min speech duration in ms
|
| 47 |
+
-vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
|
| 48 |
+
-vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
|
| 49 |
+
-vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
|
| 50 |
+
-vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
|
| 51 |
+
-np, --no-prints [false ] do not print anything other than the results
|
| 52 |
+
```
|
examples/vad-speech-segments/speech.cpp
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "common.h"
#include "common-whisper.h"

#include "whisper.h"

#include <algorithm>
#include <cfloat>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <thread>
#include <vector>
|
| 9 |
+
|
| 10 |
+
// command-line parameters
struct cli_params {
    // worker threads for VAD computation; capped at 4 or the hardware concurrency
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    std::string vad_model = "";                       // path to the VAD model file (-vm)
    float vad_threshold = 0.5f;                       // speech probability threshold (-vt)
    int   vad_min_speech_duration_ms  = 250;          // discard speech runs shorter than this (-vspd)
    int   vad_min_silence_duration_ms = 100;          // silence gap required to split segments (-vsd)
    float vad_max_speech_duration_s   = FLT_MAX;      // auto-split segments longer than this (-vmsd)
    int   vad_speech_pad_ms = 30;                     // padding added around each segment (-vp)
    float vad_samples_overlap = 0.1f;                 // overlap between segments, in seconds (-vo)
    bool  use_gpu   = false;                          // offload the VAD model to GPU (-ug)
    std::string fname_inp = {};                       // input audio file path (-f)
    bool  no_prints = false;                          // suppress whisper.cpp log output (-np)
};
|
| 24 |
+
|
| 25 |
+
// Print the usage/help text for all supported command-line options,
// showing each option's current (default) value from params.
static void vad_print_usage(int /*argc*/, char ** argv, const cli_params & params) {
    // Materialize the default for --vad-max-speech-duration-s up front instead of
    // a ternary over two different temporary c_str() results.
    const std::string max_speech_s = params.vad_max_speech_duration_s == FLT_MAX
        ? std::string("FLT_MAX")
        : std::to_string(params.vad_max_speech_duration_s);

    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] file\n", argv[0]);
    fprintf(stderr, "supported audio formats: flac, mp3, ogg, wav\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h,        --help                           [default] show this help message and exit\n");
    fprintf(stderr, "  -f FNAME,  --file FNAME                     [%-7s] input audio file path\n", "");
    fprintf(stderr, "  -t N,      --threads N                      [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -ug,       --use-gpu                        [%-7s] use GPU\n", params.use_gpu ? "true" : "false");
    fprintf(stderr, "  -vm FNAME, --vad-model FNAME                [%-7s] VAD model path\n", params.vad_model.c_str());
    fprintf(stderr, "  -vt N,     --vad-threshold N                [%-7.2f] VAD threshold for speech recognition (0.0-1.0)\n", params.vad_threshold);
    // fix: the "(0.0-1.0)" range belonged to --vad-threshold, not this millisecond duration
    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N   [%-7d] VAD min speech duration in ms\n", params.vad_min_speech_duration_ms);
    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N  [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N    [%-7s] VAD max speech duration (auto-split longer)\n", max_speech_s.c_str());
    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms N            [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
    fprintf(stderr, "  -vo N,     --vad-samples-overlap N          [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "  -np,       --no-prints                      [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
    fprintf(stderr, "\n");
}
|
| 47 |
+
|
| 48 |
+
// Report a missing value for an option and terminate the process.
// Declared to return char* only so it can appear in the ARGV_NEXT ternary
// alongside argv[++i]; it never actually returns.
static char * requires_value_error(const std::string & arg) {
    fprintf(stderr, "error: argument %s requires value\n", arg.c_str());
    exit(1); // fix: a missing required value is an error — exit with a non-zero status
}
|
| 52 |
+
|
| 53 |
+
static bool vad_params_parse(int argc, char ** argv, cli_params & params) {
|
| 54 |
+
for (int i = 1; i < argc; i++) {
|
| 55 |
+
std::string arg = argv[i];
|
| 56 |
+
|
| 57 |
+
if (arg == "-h" || arg == "--help") {
|
| 58 |
+
vad_print_usage(argc, argv, params);
|
| 59 |
+
exit(0);
|
| 60 |
+
}
|
| 61 |
+
#define ARGV_NEXT (((i + 1) < argc) ? argv[++i] : requires_value_error(arg))
|
| 62 |
+
else if (arg == "-f" || arg == "--file") { params.fname_inp = ARGV_NEXT; }
|
| 63 |
+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(ARGV_NEXT); }
|
| 64 |
+
else if (arg == "-ug" || arg == "--use-gpu") { params.use_gpu = true; }
|
| 65 |
+
else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
|
| 66 |
+
else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
|
| 67 |
+
else if (arg == "-vsd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
| 68 |
+
else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
|
| 69 |
+
else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
|
| 70 |
+
else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
|
| 71 |
+
else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(ARGV_NEXT); }
|
| 72 |
+
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
| 73 |
+
else {
|
| 74 |
+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
| 75 |
+
vad_print_usage(argc, argv, params);
|
| 76 |
+
exit(0);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
return true;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
// No-op log callback: installed via whisper_log_set() to suppress all library
// log output when --no-prints is given.
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
|
| 84 |
+
|
| 85 |
+
int main(int argc, char ** argv) {
|
| 86 |
+
cli_params cli_params;
|
| 87 |
+
|
| 88 |
+
if (!vad_params_parse(argc, argv, cli_params)) {
|
| 89 |
+
vad_print_usage(argc, argv, cli_params);
|
| 90 |
+
return 1;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
if (cli_params.no_prints) {
|
| 94 |
+
whisper_log_set(cb_log_disable, NULL);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
// Load the input sample audio file.
|
| 98 |
+
std::vector<float> pcmf32;
|
| 99 |
+
std::vector<std::vector<float>> pcmf32s;
|
| 100 |
+
if (!read_audio_data(cli_params.fname_inp.c_str(), pcmf32, pcmf32s, false)) {
|
| 101 |
+
fprintf(stderr, "error: failed to read audio data from %s\n", cli_params.fname_inp.c_str());
|
| 102 |
+
return 2;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
// Initialize the context which loads the VAD model.
|
| 106 |
+
struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
|
| 107 |
+
ctx_params.n_threads = cli_params.n_threads;
|
| 108 |
+
ctx_params.use_gpu = cli_params.use_gpu;
|
| 109 |
+
struct whisper_vad_context * vctx = whisper_vad_init_from_file_with_params(
|
| 110 |
+
cli_params.vad_model.c_str(),
|
| 111 |
+
ctx_params);
|
| 112 |
+
|
| 113 |
+
// Detect speech in the input audio file.
|
| 114 |
+
if (!whisper_vad_detect_speech(vctx, pcmf32.data(), pcmf32.size())) {
|
| 115 |
+
fprintf(stderr, "error: failed to detect speech\n");
|
| 116 |
+
return 3;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
// Get the the vad segements using the probabilities that have been computed
|
| 120 |
+
// previously and stored in the whisper_vad_context.
|
| 121 |
+
struct whisper_vad_params params = whisper_vad_default_params();
|
| 122 |
+
params.threshold = cli_params.vad_threshold;
|
| 123 |
+
params.min_speech_duration_ms = cli_params.vad_min_speech_duration_ms;
|
| 124 |
+
params.min_silence_duration_ms = cli_params.vad_min_silence_duration_ms;
|
| 125 |
+
params.max_speech_duration_s = cli_params.vad_max_speech_duration_s;
|
| 126 |
+
params.speech_pad_ms = cli_params.vad_speech_pad_ms;
|
| 127 |
+
params.samples_overlap = cli_params.vad_samples_overlap;
|
| 128 |
+
struct whisper_vad_segments * segments = whisper_vad_segments_from_probs(vctx, params);
|
| 129 |
+
|
| 130 |
+
printf("\n");
|
| 131 |
+
printf("Detected %d speech segments:\n", whisper_vad_segments_n_segments(segments));
|
| 132 |
+
for (int i = 0; i < whisper_vad_segments_n_segments(segments); ++i) {
|
| 133 |
+
printf("Speech segment %d: start = %.2f, end = %.2f\n", i,
|
| 134 |
+
whisper_vad_segments_get_segment_t0(segments, i),
|
| 135 |
+
whisper_vad_segments_get_segment_t1(segments, i));
|
| 136 |
+
}
|
| 137 |
+
printf("\n");
|
| 138 |
+
|
| 139 |
+
whisper_vad_free_segments(segments);
|
| 140 |
+
whisper_vad_free(vctx);
|
| 141 |
+
|
| 142 |
+
return 0;
|
| 143 |
+
}
|