Spaces:
Running
Running
examples : add --print-confidence option to cli (#3150)
Browse files
* examples : add --print-confidence option to cli
This commit adds a new command-line option `--print-confidence` to the
whisper-cli. When enabled, this option prints the confidence level of each
token in the transcribed text using ANSI formatting codes.
The confidence levels are represented using different styles:
```console
main: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)
```
Refs: https://github.com/ggml-org/whisper.cpp/issues/3135
- examples/cli/cli.cpp +25 -0
- examples/common.h +20 -0
examples/cli/cli.cpp
CHANGED
|
@@ -70,6 +70,7 @@ struct whisper_params {
|
|
| 70 |
bool no_prints = false;
|
| 71 |
bool print_special = false;
|
| 72 |
bool print_colors = false;
|
|
|
|
| 73 |
bool print_progress = false;
|
| 74 |
bool no_timestamps = false;
|
| 75 |
bool log_score = false;
|
|
@@ -179,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
|
|
| 179 |
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
| 180 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 181 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
|
|
|
| 182 |
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
| 183 |
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
| 184 |
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
|
|
@@ -257,6 +259,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
|
|
| 257 |
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
|
| 258 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 259 |
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
|
|
|
| 260 |
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
| 261 |
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
| 262 |
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
@@ -386,6 +389,26 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
|
|
| 386 |
|
| 387 |
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
| 388 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
} else {
|
| 390 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 391 |
|
|
@@ -1115,6 +1138,8 @@ int main(int argc, char ** argv) {
|
|
| 1115 |
|
| 1116 |
if (params.print_colors) {
|
| 1117 |
fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
|
|
|
|
|
|
|
| 1118 |
}
|
| 1119 |
fprintf(stderr, "\n");
|
| 1120 |
}
|
|
|
|
| 70 |
bool no_prints = false;
|
| 71 |
bool print_special = false;
|
| 72 |
bool print_colors = false;
|
| 73 |
+
bool print_confidence= false;
|
| 74 |
bool print_progress = false;
|
| 75 |
bool no_timestamps = false;
|
| 76 |
bool log_score = false;
|
|
|
|
| 180 |
else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
|
| 181 |
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
| 182 |
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
| 183 |
+
else if ( arg == "--print-confidence"){ params.print_confidence= true; }
|
| 184 |
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
| 185 |
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
| 186 |
else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
|
|
|
|
| 259 |
fprintf(stderr, " -np, --no-prints [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
|
| 260 |
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
| 261 |
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
| 262 |
+
fprintf(stderr, " --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false");
|
| 263 |
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
| 264 |
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
| 265 |
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
|
|
|
| 389 |
|
| 390 |
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
| 391 |
}
|
| 392 |
+
} else if (params.print_confidence) {
|
| 393 |
+
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
| 394 |
+
if (params.print_special == false) {
|
| 395 |
+
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
| 396 |
+
if (id >= whisper_token_eot(ctx)) {
|
| 397 |
+
continue;
|
| 398 |
+
}
|
| 399 |
+
}
|
| 400 |
+
|
| 401 |
+
const char * text = whisper_full_get_token_text(ctx, i, j);
|
| 402 |
+
const float p = whisper_full_get_token_p (ctx, i, j);
|
| 403 |
+
|
| 404 |
+
int style_idx = 2; // High confidence - dim
|
| 405 |
+
if (p < 0.33) {
|
| 406 |
+
style_idx = 0; // Low confidence - inverse (highlighted)
|
| 407 |
+
} else if (p < 0.66) {
|
| 408 |
+
style_idx = 1; // Medium confidence - underlined
|
| 409 |
+
}
|
| 410 |
+
printf("%s%s%s%s", speaker.c_str(), k_styles[style_idx].c_str(), text, "\033[0m");
|
| 411 |
+
}
|
| 412 |
} else {
|
| 413 |
const char * text = whisper_full_get_segment_text(ctx, i);
|
| 414 |
|
|
|
|
| 1138 |
|
| 1139 |
if (params.print_colors) {
|
| 1140 |
fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
|
| 1141 |
+
} else if (params.print_confidence) {
|
| 1142 |
+
fprintf(stderr, "%s: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)\n", __func__);
|
| 1143 |
}
|
| 1144 |
fprintf(stderr, "\n");
|
| 1145 |
}
|
examples/common.h
CHANGED
|
@@ -294,6 +294,26 @@ const std::vector<std::string> k_colors = {
|
|
| 294 |
set_xterm256_foreground( 78, 178, 101),
|
| 295 |
};
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
//
|
| 298 |
// Other utils
|
| 299 |
//
|
|
|
|
| 294 |
set_xterm256_foreground( 78, 178, 101),
|
| 295 |
};
|
| 296 |
|
| 297 |
+
// ANSI formatting codes
|
| 298 |
+
// Returns the ANSI escape sequence "ESC[7m" (inverse / reverse video).
// k_styles uses it as the low-confidence style.
static std::string set_inverse() {
|
| 299 |
+
return "\033[7m";
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
// Returns the ANSI escape sequence "ESC[4m" (underline).
// k_styles uses it as the medium-confidence style.
static std::string set_underline() {
|
| 303 |
+
return "\033[4m";
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
// Returns the ANSI escape sequence "ESC[2m" (dim / faint).
// k_styles uses it as the high-confidence style.
static std::string set_dim() {
|
| 307 |
+
return "\033[2m";
|
| 308 |
+
}
|
| 309 |
+
|
| 310 |
+
// Style scheme for different confidence levels
|
| 311 |
+
// Indexed by confidence bucket: 0 = low, 1 = medium, 2 = high.
// The order must stay in sync with the style_idx selection in
// examples/cli/cli.cpp (p < 0.33 -> 0, p < 0.66 -> 1, otherwise 2).
const std::vector<std::string> k_styles = {
|
| 312 |
+
set_inverse(), // Low confidence - inverse (highlighted)
|
| 313 |
+
set_underline(), // Medium confidence - underlined
|
| 314 |
+
set_dim(), // High confidence - dim
|
| 315 |
+
};
|
| 316 |
+
|
| 317 |
//
|
| 318 |
// Other utils
|
| 319 |
//
|