Spaces:
Running
Running
node : add audio_ctx and audio buffer params (#2123)
Browse files
* node : add audio_ctx param
* node : support passing audio buffer directly
* node : parse audio_ctx in index.js
---------
Co-authored-by: Georgi Gerganov <[email protected]>
examples/addon.node/__test__/whisper.spec.js
CHANGED
|
@@ -16,6 +16,7 @@ const whisperParamsMock = {
|
|
| 16 |
comma_in_time: false,
|
| 17 |
translate: true,
|
| 18 |
no_timestamps: false,
|
|
|
|
| 19 |
};
|
| 20 |
|
| 21 |
describe("Run whisper.node", () => {
|
|
|
|
| 16 |
comma_in_time: false,
|
| 17 |
translate: true,
|
| 18 |
no_timestamps: false,
|
| 19 |
+
audio_ctx: 0,
|
| 20 |
};
|
| 21 |
|
| 22 |
describe("Run whisper.node", () => {
|
examples/addon.node/addon.cpp
CHANGED
|
@@ -19,6 +19,7 @@ struct whisper_params {
|
|
| 19 |
int32_t max_len = 0;
|
| 20 |
int32_t best_of = 5;
|
| 21 |
int32_t beam_size = -1;
|
|
|
|
| 22 |
|
| 23 |
float word_thold = 0.01f;
|
| 24 |
float entropy_thold = 2.4f;
|
|
@@ -46,6 +47,8 @@ struct whisper_params {
|
|
| 46 |
|
| 47 |
std::vector<std::string> fname_inp = {};
|
| 48 |
std::vector<std::string> fname_out = {};
|
|
|
|
|
|
|
| 49 |
};
|
| 50 |
|
| 51 |
struct whisper_print_user_data {
|
|
@@ -125,13 +128,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
|
| 125 |
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
|
| 126 |
|
| 127 |
int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
| 128 |
-
|
| 129 |
if (params.no_prints) {
|
| 130 |
whisper_log_set(cb_log_disable, NULL);
|
| 131 |
}
|
| 132 |
|
| 133 |
-
if (params.fname_inp.empty()) {
|
| 134 |
-
fprintf(stderr, "error: no input files specified\n");
|
| 135 |
return 2;
|
| 136 |
}
|
| 137 |
|
|
@@ -151,6 +153,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
| 151 |
return 3;
|
| 152 |
}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
| 155 |
const auto fname_inp = params.fname_inp[f];
|
| 156 |
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
|
@@ -158,9 +168,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
| 158 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 159 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
}
|
| 165 |
|
| 166 |
// print system information
|
|
@@ -180,12 +195,13 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
| 180 |
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
| 181 |
}
|
| 182 |
}
|
| 183 |
-
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
|
| 184 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
| 185 |
params.n_threads, params.n_processors,
|
| 186 |
params.language.c_str(),
|
| 187 |
params.translate ? "translate" : "transcribe",
|
| 188 |
-
params.no_timestamps ? 0 : 1
|
|
|
|
| 189 |
|
| 190 |
fprintf(stderr, "\n");
|
| 191 |
}
|
|
@@ -212,6 +228,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
| 212 |
wparams.entropy_thold = params.entropy_thold;
|
| 213 |
wparams.logprob_thold = params.logprob_thold;
|
| 214 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
|
|
|
| 215 |
|
| 216 |
wparams.speed_up = params.speed_up;
|
| 217 |
|
|
@@ -311,14 +328,28 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
|
| 311 |
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
|
| 312 |
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
| 313 |
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
|
|
|
| 314 |
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
params.language = language;
|
| 317 |
params.model = model;
|
| 318 |
params.fname_inp.emplace_back(input);
|
| 319 |
params.use_gpu = use_gpu;
|
| 320 |
params.no_prints = no_prints;
|
| 321 |
params.no_timestamps = no_timestamps;
|
|
|
|
|
|
|
| 322 |
params.comma_in_time = comma_in_time;
|
| 323 |
|
| 324 |
Napi::Function callback = info[1].As<Napi::Function>();
|
|
|
|
| 19 |
int32_t max_len = 0;
|
| 20 |
int32_t best_of = 5;
|
| 21 |
int32_t beam_size = -1;
|
| 22 |
+
int32_t audio_ctx = 0;
|
| 23 |
|
| 24 |
float word_thold = 0.01f;
|
| 25 |
float entropy_thold = 2.4f;
|
|
|
|
| 47 |
|
| 48 |
std::vector<std::string> fname_inp = {};
|
| 49 |
std::vector<std::string> fname_out = {};
|
| 50 |
+
|
| 51 |
+
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
|
| 52 |
};
|
| 53 |
|
| 54 |
struct whisper_print_user_data {
|
|
|
|
| 128 |
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
|
| 129 |
|
| 130 |
int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
|
|
|
|
| 131 |
if (params.no_prints) {
|
| 132 |
whisper_log_set(cb_log_disable, NULL);
|
| 133 |
}
|
| 134 |
|
| 135 |
+
if (params.fname_inp.empty() && params.pcmf32.empty()) {
|
| 136 |
+
fprintf(stderr, "error: no input files or audio buffer specified\n");
|
| 137 |
return 2;
|
| 138 |
}
|
| 139 |
|
|
|
|
| 153 |
return 3;
|
| 154 |
}
|
| 155 |
|
| 156 |
+
// if params.pcmf32 is provided, set params.fname_inp to "buffer"
|
| 157 |
+
// this is simpler than further modifications in the code
|
| 158 |
+
if (!params.pcmf32.empty()) {
|
| 159 |
+
fprintf(stderr, "info: using audio buffer as input\n");
|
| 160 |
+
params.fname_inp.clear();
|
| 161 |
+
params.fname_inp.emplace_back("buffer");
|
| 162 |
+
}
|
| 163 |
+
|
| 164 |
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
| 165 |
const auto fname_inp = params.fname_inp[f];
|
| 166 |
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
|
|
|
| 168 |
std::vector<float> pcmf32; // mono-channel F32 PCM
|
| 169 |
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
| 170 |
|
| 171 |
+
// read the input audio file if params.pcmf32 is not provided
|
| 172 |
+
if (params.pcmf32.empty()) {
|
| 173 |
+
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
| 174 |
+
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
|
| 175 |
+
continue;
|
| 176 |
+
}
|
| 177 |
+
} else {
|
| 178 |
+
pcmf32 = params.pcmf32;
|
| 179 |
}
|
| 180 |
|
| 181 |
// print system information
|
|
|
|
| 195 |
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
| 196 |
}
|
| 197 |
}
|
| 198 |
+
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
|
| 199 |
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
| 200 |
params.n_threads, params.n_processors,
|
| 201 |
params.language.c_str(),
|
| 202 |
params.translate ? "translate" : "transcribe",
|
| 203 |
+
params.no_timestamps ? 0 : 1,
|
| 204 |
+
params.audio_ctx);
|
| 205 |
|
| 206 |
fprintf(stderr, "\n");
|
| 207 |
}
|
|
|
|
| 228 |
wparams.entropy_thold = params.entropy_thold;
|
| 229 |
wparams.logprob_thold = params.logprob_thold;
|
| 230 |
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
| 231 |
+
wparams.audio_ctx = params.audio_ctx;
|
| 232 |
|
| 233 |
wparams.speed_up = params.speed_up;
|
| 234 |
|
|
|
|
| 328 |
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
|
| 329 |
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
|
| 330 |
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
|
| 331 |
+
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
| 332 |
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
| 333 |
|
| 334 |
+
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
|
| 335 |
+
std::vector<float> pcmf32_vec;
|
| 336 |
+
if (pcmf32Value.IsTypedArray()) {
|
| 337 |
+
Napi::Float32Array pcmf32 = pcmf32Value.As<Napi::Float32Array>();
|
| 338 |
+
size_t length = pcmf32.ElementLength();
|
| 339 |
+
pcmf32_vec.reserve(length);
|
| 340 |
+
for (size_t i = 0; i < length; i++) {
|
| 341 |
+
pcmf32_vec.push_back(pcmf32[i]);
|
| 342 |
+
}
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
params.language = language;
|
| 346 |
params.model = model;
|
| 347 |
params.fname_inp.emplace_back(input);
|
| 348 |
params.use_gpu = use_gpu;
|
| 349 |
params.no_prints = no_prints;
|
| 350 |
params.no_timestamps = no_timestamps;
|
| 351 |
+
params.audio_ctx = audio_ctx;
|
| 352 |
+
params.pcmf32 = pcmf32_vec;
|
| 353 |
params.comma_in_time = comma_in_time;
|
| 354 |
|
| 355 |
Napi::Function callback = info[1].As<Napi::Function>();
|
examples/addon.node/index.js
CHANGED
|
@@ -16,13 +16,20 @@ const whisperParams = {
|
|
| 16 |
comma_in_time: false,
|
| 17 |
translate: true,
|
| 18 |
no_timestamps: false,
|
|
|
|
| 19 |
};
|
| 20 |
|
| 21 |
const arguments = process.argv.slice(2);
|
| 22 |
const params = Object.fromEntries(
|
| 23 |
arguments.reduce((pre, item) => {
|
| 24 |
if (item.startsWith("--")) {
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
return pre;
|
| 28 |
}, [])
|
|
|
|
| 16 |
comma_in_time: false,
|
| 17 |
translate: true,
|
| 18 |
no_timestamps: false,
|
| 19 |
+
audio_ctx: 0,
|
| 20 |
};
|
| 21 |
|
| 22 |
const arguments = process.argv.slice(2);
|
| 23 |
const params = Object.fromEntries(
|
| 24 |
arguments.reduce((pre, item) => {
|
| 25 |
if (item.startsWith("--")) {
|
| 26 |
+
const [key, value] = item.slice(2).split("=");
|
| 27 |
+
if (key === "audio_ctx") {
|
| 28 |
+
whisperParams[key] = parseInt(value);
|
| 29 |
+
} else {
|
| 30 |
+
whisperParams[key] = value;
|
| 31 |
+
}
|
| 32 |
+
return pre;
|
| 33 |
}
|
| 34 |
return pre;
|
| 35 |
}, [])
|