pprobst ggerganov committed on
Commit
9b4d9d5
·
unverified ·
1 Parent(s): a90ae59

node : add audio_ctx and audio buffer params (#2123)

Browse files

* node : add audio_ctx param

* node : support passing audio buffer directly

* node : parse audio_ctx in index.js

---------

Co-authored-by: Georgi Gerganov <[email protected]>

examples/addon.node/__test__/whisper.spec.js CHANGED
@@ -16,6 +16,7 @@ const whisperParamsMock = {
16
  comma_in_time: false,
17
  translate: true,
18
  no_timestamps: false,
 
19
  };
20
 
21
  describe("Run whisper.node", () => {
 
16
  comma_in_time: false,
17
  translate: true,
18
  no_timestamps: false,
19
+ audio_ctx: 0,
20
  };
21
 
22
  describe("Run whisper.node", () => {
examples/addon.node/addon.cpp CHANGED
@@ -19,6 +19,7 @@ struct whisper_params {
19
  int32_t max_len = 0;
20
  int32_t best_of = 5;
21
  int32_t beam_size = -1;
 
22
 
23
  float word_thold = 0.01f;
24
  float entropy_thold = 2.4f;
@@ -46,6 +47,8 @@ struct whisper_params {
46
 
47
  std::vector<std::string> fname_inp = {};
48
  std::vector<std::string> fname_out = {};
 
 
49
  };
50
 
51
  struct whisper_print_user_data {
@@ -125,13 +128,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
125
  void cb_log_disable(enum ggml_log_level, const char *, void *) {}
126
 
127
  int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
128
-
129
  if (params.no_prints) {
130
  whisper_log_set(cb_log_disable, NULL);
131
  }
132
 
133
- if (params.fname_inp.empty()) {
134
- fprintf(stderr, "error: no input files specified\n");
135
  return 2;
136
  }
137
 
@@ -151,6 +153,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
151
  return 3;
152
  }
153
 
 
 
 
 
 
 
 
 
154
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
155
  const auto fname_inp = params.fname_inp[f];
156
  const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@@ -158,9 +168,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
158
  std::vector<float> pcmf32; // mono-channel F32 PCM
159
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
160
 
161
- if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
162
- fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
163
- continue;
 
 
 
 
 
164
  }
165
 
166
  // print system information
@@ -180,12 +195,13 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
180
  fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
181
  }
182
  }
183
- fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
184
  __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
185
  params.n_threads, params.n_processors,
186
  params.language.c_str(),
187
  params.translate ? "translate" : "transcribe",
188
- params.no_timestamps ? 0 : 1);
 
189
 
190
  fprintf(stderr, "\n");
191
  }
@@ -212,6 +228,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
212
  wparams.entropy_thold = params.entropy_thold;
213
  wparams.logprob_thold = params.logprob_thold;
214
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
 
215
 
216
  wparams.speed_up = params.speed_up;
217
 
@@ -311,14 +328,28 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
311
  bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
312
  bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
313
  bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
 
314
  bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
315
 
 
 
 
 
 
 
 
 
 
 
 
316
  params.language = language;
317
  params.model = model;
318
  params.fname_inp.emplace_back(input);
319
  params.use_gpu = use_gpu;
320
  params.no_prints = no_prints;
321
  params.no_timestamps = no_timestamps;
 
 
322
  params.comma_in_time = comma_in_time;
323
 
324
  Napi::Function callback = info[1].As<Napi::Function>();
 
19
  int32_t max_len = 0;
20
  int32_t best_of = 5;
21
  int32_t beam_size = -1;
22
+ int32_t audio_ctx = 0;
23
 
24
  float word_thold = 0.01f;
25
  float entropy_thold = 2.4f;
 
47
 
48
  std::vector<std::string> fname_inp = {};
49
  std::vector<std::string> fname_out = {};
50
+
51
+ std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
52
  };
53
 
54
  struct whisper_print_user_data {
 
128
  void cb_log_disable(enum ggml_log_level, const char *, void *) {}
129
 
130
  int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
 
131
  if (params.no_prints) {
132
  whisper_log_set(cb_log_disable, NULL);
133
  }
134
 
135
+ if (params.fname_inp.empty() && params.pcmf32.empty()) {
136
+ fprintf(stderr, "error: no input files or audio buffer specified\n");
137
  return 2;
138
  }
139
 
 
153
  return 3;
154
  }
155
 
156
+ // if params.pcmf32 is provided, set params.fname_inp to "buffer"
157
+ // this is simpler than further modifications in the code
158
+ if (!params.pcmf32.empty()) {
159
+ fprintf(stderr, "info: using audio buffer as input\n");
160
+ params.fname_inp.clear();
161
+ params.fname_inp.emplace_back("buffer");
162
+ }
163
+
164
  for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
165
  const auto fname_inp = params.fname_inp[f];
166
  const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
 
168
  std::vector<float> pcmf32; // mono-channel F32 PCM
169
  std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
170
 
171
+ // read the input audio file if params.pcmf32 is not provided
172
+ if (params.pcmf32.empty()) {
173
+ if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
174
+ fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
175
+ continue;
176
+ }
177
+ } else {
178
+ pcmf32 = params.pcmf32;
179
  }
180
 
181
  // print system information
 
195
  fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
196
  }
197
  }
198
+ fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
199
  __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
200
  params.n_threads, params.n_processors,
201
  params.language.c_str(),
202
  params.translate ? "translate" : "transcribe",
203
+ params.no_timestamps ? 0 : 1,
204
+ params.audio_ctx);
205
 
206
  fprintf(stderr, "\n");
207
  }
 
228
  wparams.entropy_thold = params.entropy_thold;
229
  wparams.logprob_thold = params.logprob_thold;
230
  wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
231
+ wparams.audio_ctx = params.audio_ctx;
232
 
233
  wparams.speed_up = params.speed_up;
234
 
 
328
  bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
329
  bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
330
  bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
331
+ int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
332
  bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
333
 
334
+ Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
335
+ std::vector<float> pcmf32_vec;
336
+ if (pcmf32Value.IsTypedArray()) {
337
+ Napi::Float32Array pcmf32 = pcmf32Value.As<Napi::Float32Array>();
338
+ size_t length = pcmf32.ElementLength();
339
+ pcmf32_vec.reserve(length);
340
+ for (size_t i = 0; i < length; i++) {
341
+ pcmf32_vec.push_back(pcmf32[i]);
342
+ }
343
+ }
344
+
345
  params.language = language;
346
  params.model = model;
347
  params.fname_inp.emplace_back(input);
348
  params.use_gpu = use_gpu;
349
  params.no_prints = no_prints;
350
  params.no_timestamps = no_timestamps;
351
+ params.audio_ctx = audio_ctx;
352
+ params.pcmf32 = pcmf32_vec;
353
  params.comma_in_time = comma_in_time;
354
 
355
  Napi::Function callback = info[1].As<Napi::Function>();
examples/addon.node/index.js CHANGED
@@ -16,13 +16,20 @@ const whisperParams = {
16
  comma_in_time: false,
17
  translate: true,
18
  no_timestamps: false,
 
19
  };
20
 
21
  const arguments = process.argv.slice(2);
22
  const params = Object.fromEntries(
23
  arguments.reduce((pre, item) => {
24
  if (item.startsWith("--")) {
25
- return [...pre, item.slice(2).split("=")];
 
 
 
 
 
 
26
  }
27
  return pre;
28
  }, [])
 
16
  comma_in_time: false,
17
  translate: true,
18
  no_timestamps: false,
19
+ audio_ctx: 0,
20
  };
21
 
22
  const arguments = process.argv.slice(2);
23
  const params = Object.fromEntries(
24
  arguments.reduce((pre, item) => {
25
  if (item.startsWith("--")) {
26
+ const [key, value] = item.slice(2).split("=");
27
+ if (key === "audio_ctx") {
28
+ whisperParams[key] = parseInt(value);
29
+ } else {
30
+ whisperParams[key] = value;
31
+ }
32
+ return pre;
33
  }
34
  return pre;
35
  }, [])