File size: 4,541 Bytes
f795870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
const path = require("path");
const { whisper } = require(path.join(
  __dirname,
  "../../build/Release/addon.node"
));
const { promisify } = require("util");

const whisperAsync = promisify(whisper);

// Example with VAD enabled
const vadParams = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  use_gpu: true,
  flash_attn: false,
  no_prints: false,
  comma_in_time: true,
  translate: false,
  no_timestamps: false,
  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  // VAD parameters
  vad: true,
  vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
  vad_threshold: 0.5,
  vad_min_speech_duration_ms: 250,
  vad_min_silence_duration_ms: 100,
  vad_max_speech_duration_s: 30.0,
  vad_speech_pad_ms: 30,
  vad_samples_overlap: 0.1,
  progress_callback: (progress) => {
    console.log(`VAD Transcription progress: ${progress}%`);
  }
};

// Example without VAD (traditional approach)
const traditionalParams = {
  language: "en",
  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
  fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
  use_gpu: true,
  flash_attn: false,
  no_prints: false,
  comma_in_time: true,
  translate: false,
  no_timestamps: false,
  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  vad: false, // Explicitly disable VAD
  progress_callback: (progress) => {
    console.log(`Traditional transcription progress: ${progress}%`);
  }
};

async function runVADExample() {
  try {
    console.log("=== Whisper.cpp Node.js VAD Example ===\n");
    
    // Check if VAD model exists
    const fs = require('fs');
    if (!fs.existsSync(vadParams.vad_model)) {
      console.log("⚠️  VAD model not found. Please download the VAD model first:");
      console.log("   ./models/download-vad-model.sh silero-v5.1.2");
      console.log("   Or run: python models/convert-silero-vad-to-ggml.py");
      console.log("\n   Falling back to traditional transcription without VAD...\n");
      
      // Run without VAD
      console.log("🎵 Running traditional transcription...");
      const traditionalResult = await whisperAsync(traditionalParams);
      console.log("\n📝 Traditional transcription result:");
      console.log(traditionalResult);
      return;
    }
    
    console.log("🎵 Running transcription with VAD enabled...");
    console.log("VAD Parameters:");
    console.log(`  - Threshold: ${vadParams.vad_threshold}`);
    console.log(`  - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
    console.log(`  - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
    console.log(`  - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
    console.log(`  - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
    console.log(`  - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
    
    const startTime = Date.now();
    const vadResult = await whisperAsync(vadParams);
    const vadDuration = Date.now() - startTime;
    
    console.log("\n✅ VAD transcription completed!");
    console.log(`⏱️  Processing time: ${vadDuration}ms`);
    console.log("\n📝 VAD transcription result:");
    console.log(vadResult);
    
    // Compare with traditional approach
    console.log("\n🔄 Running traditional transcription for comparison...");
    const traditionalStartTime = Date.now();
    const traditionalResult = await whisperAsync(traditionalParams);
    const traditionalDuration = Date.now() - traditionalStartTime;
    
    console.log("\n✅ Traditional transcription completed!");
    console.log(`⏱️  Processing time: ${traditionalDuration}ms`);
    console.log("\n📝 Traditional transcription result:");
    console.log(traditionalResult);
    
    // Performance comparison
    console.log("\n📊 Performance Comparison:");
    console.log(`VAD:         ${vadDuration}ms`);
    console.log(`Traditional: ${traditionalDuration}ms`);
    const speedup = traditionalDuration / vadDuration;
    if (speedup > 1) {
      console.log(`🚀 VAD is ${speedup.toFixed(2)}x faster!`);
    } else {
      console.log(`ℹ️  Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
    }
    
  } catch (error) {
    console.error("❌ Error during transcription:", error);
  }
}

// Run the example
if (require.main === module) {
  runVADExample();
}

module.exports = {
  runVADExample,
  vadParams,
  traditionalParams
};