danbev committed on
Commit
2d83266
·
unverified ·
1 Parent(s): a40b758

examples : add --print-confidence option to cli (#3150)

Browse files

* examples : add --print-confidence option to cli

This commit adds a new command-line option `--print-confidence` to the
whisper-cli. When enabled, this option prints the confidence level of each
token in the transcribed text using ANSI formatting codes.

The confidence levels are represented using different styles:
```console
main: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)
```

Refs: https://github.com/ggml-org/whisper.cpp/issues/3135

Files changed (2) hide show
  1. examples/cli/cli.cpp +25 -0
  2. examples/common.h +20 -0
examples/cli/cli.cpp CHANGED
@@ -70,6 +70,7 @@ struct whisper_params {
70
  bool no_prints = false;
71
  bool print_special = false;
72
  bool print_colors = false;
 
73
  bool print_progress = false;
74
  bool no_timestamps = false;
75
  bool log_score = false;
@@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
179
  else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
180
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
181
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
 
182
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
183
  else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
184
  else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
@@ -257,6 +259,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
257
  fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
258
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
259
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
 
260
  fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
261
  fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
262
  fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
@@ -386,6 +389,26 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
386
 
387
  printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
388
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  } else {
390
  const char * text = whisper_full_get_segment_text(ctx, i);
391
 
@@ -1115,6 +1138,8 @@ int main(int argc, char ** argv) {
1115
 
1116
  if (params.print_colors) {
1117
  fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
 
 
1118
  }
1119
  fprintf(stderr, "\n");
1120
  }
 
70
  bool no_prints = false;
71
  bool print_special = false;
72
  bool print_colors = false;
73
+ bool print_confidence= false;
74
  bool print_progress = false;
75
  bool no_timestamps = false;
76
  bool log_score = false;
 
180
  else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
181
  else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
182
  else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
183
+ else if ( arg == "--print-confidence"){ params.print_confidence= true; }
184
  else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
185
  else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
186
  else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
 
259
  fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
260
  fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
261
  fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
262
+ fprintf(stderr, " --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false");
263
  fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
264
  fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
265
  fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
 
389
 
390
  printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
391
  }
392
+ } else if (params.print_confidence) {
393
+ for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
394
+ if (params.print_special == false) {
395
+ const whisper_token id = whisper_full_get_token_id(ctx, i, j);
396
+ if (id >= whisper_token_eot(ctx)) {
397
+ continue;
398
+ }
399
+ }
400
+
401
+ const char * text = whisper_full_get_token_text(ctx, i, j);
402
+ const float p = whisper_full_get_token_p (ctx, i, j);
403
+
404
+ int style_idx = 2; // High confidence - dim
405
+ if (p < 0.33) {
406
+ style_idx = 0; // Low confidence - inverse (highlighted)
407
+ } else if (p < 0.66) {
408
+ style_idx = 1; // Medium confidence - underlined
409
+ }
410
+ printf("%s%s%s%s", speaker.c_str(), k_styles[style_idx].c_str(), text, "\033[0m");
411
+ }
412
  } else {
413
  const char * text = whisper_full_get_segment_text(ctx, i);
414
 
 
1138
 
1139
  if (params.print_colors) {
1140
  fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
1141
+ } else if (params.print_confidence) {
1142
+ fprintf(stderr, "%s: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)\n", __func__);
1143
  }
1144
  fprintf(stderr, "\n");
1145
  }
examples/common.h CHANGED
@@ -294,6 +294,26 @@ const std::vector<std::string> k_colors = {
294
  set_xterm256_foreground( 78, 178, 101),
295
  };
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  //
298
  // Other utils
299
  //
 
294
  set_xterm256_foreground( 78, 178, 101),
295
  };
296
 
297
+ // ANSI formatting codes
298
+ static std::string set_inverse() {
299
+ return "\033[7m";
300
+ }
301
+
302
+ static std::string set_underline() {
303
+ return "\033[4m";
304
+ }
305
+
306
+ static std::string set_dim() {
307
+ return "\033[2m";
308
+ }
309
+
310
+ // Style scheme for different confidence levels
311
+ const std::vector<std::string> k_styles = {
312
+ set_inverse(), // Low confidence - inverse (highlighted)
313
+ set_underline(), // Medium confidence - underlined
314
+ set_dim(), // High confidence - dim
315
+ };
316
+
317
  //
318
  // Other utils
319
  //