import React, { useState, useEffect, useRef } from 'react'; import { Mic, Square, Settings, Loader2, AlertCircle, Copy, CheckCircle2, ChevronDown, ChevronUp, Upload } from 'lucide-react'; // --- Feature Extraction: Log-Mel Spectrogram --- // This model requires 80-dim log-mel spectrogram features, standard for Conformer models. const computeLogMelSpectrogram = (audioData) => { const sr = 16000; const n_fft = 512; const win_length = 400; // 25ms const hop_length = 160; // 10ms const n_mels = 80; const preemph = 0.97; // 1. Preemphasis const preemphasized = new Float32Array(audioData.length); preemphasized[0] = audioData[0]; for (let i = 1; i < audioData.length; i++) { preemphasized[i] = audioData[i] - preemph * audioData[i - 1]; } // 2. Window (Hann) const window = new Float32Array(win_length); for (let i = 0; i < win_length; i++) { window[i] = 0.5 - 0.5 * Math.cos((2 * Math.PI * i) / (win_length - 1)); } // 3. Mel Filterbank const fmin = 0; const fmax = 8000; const melMin = 2595 * Math.log10(1 + fmin / 700); const melMax = 2595 * Math.log10(1 + fmax / 700); const melPoints = Array.from({length: n_mels + 2}, (_, i) => melMin + i * (melMax - melMin) / (n_mels + 1)); const hzPoints = melPoints.map(m => 700 * (Math.pow(10, m / 2595) - 1)); const fftFreqs = Array.from({length: n_fft / 2 + 1}, (_, i) => (i * sr) / n_fft); const fbank = []; for (let i = 0; i < n_mels; i++) { const row = new Float32Array(n_fft / 2 + 1); const f_left = hzPoints[i]; const f_center = hzPoints[i + 1]; const f_right = hzPoints[i + 2]; for (let j = 0; j < fftFreqs.length; j++) { const f = fftFreqs[j]; if (f >= f_left && f <= f_center) { row[j] = (f - f_left) / (f_center - f_left); } else if (f >= f_center && f <= f_right) { row[j] = (f_right - f) / (f_right - f_center); } } fbank.push(row); } // 4. STFT & Log-Mel Computation const numFrames = Math.floor((preemphasized.length - win_length) / hop_length) + 1; if (numFrames <= 0) return { melSpec: new Float32Array(0), numFrames: 0 }; const melSpec = new Float32Array(n_mels * numFrames); for (let frame = 0; frame < numFrames; frame++) { const start = frame * hop_length; const real = new Float32Array(n_fft); const imag = new Float32Array(n_fft); for (let i = 0; i < win_length; i++) { real[i] = preemphasized[start + i] * window[i]; } // Cooley-Tukey FFT let j = 0; for (let i = 0; i < n_fft - 1; i++) { if (i < j) { let tr = real[i]; real[i] = real[j]; real[j] = tr; let ti = imag[i]; imag[i] = imag[j]; imag[j] = ti; } let m = n_fft >> 1; while (m >= 1 && j >= m) { j -= m; m >>= 1; } j += m; } for (let l = 2; l <= n_fft; l <<= 1) { let l2 = l >> 1; let u1 = 1.0, u2 = 0.0; let c1 = Math.cos(Math.PI / l2), c2 = -Math.sin(Math.PI / l2); for (let j = 0; j < l2; j++) { for (let i = j; i < n_fft; i += l) { let i1 = i + l2; let t1 = u1 * real[i1] - u2 * imag[i1]; let t2 = u1 * imag[i1] + u2 * real[i1]; real[i1] = real[i] - t1; imag[i1] = imag[i] - t2; real[i] += t1; imag[i] += t2; } let z = u1 * c1 - u2 * c2; u2 = u1 * c2 + u2 * c1; u1 = z; } } // Apply Mel Filterbank & Log for (let m = 0; m < n_mels; m++) { let melEnergy = 0; for (let i = 0; i <= n_fft / 2; i++) { const power = real[i] * real[i] + imag[i] * imag[i]; melEnergy += power * fbank[m][i]; } const logMel = Math.log(Math.max(melEnergy, 1e-9)); melSpec[m * numFrames + frame] = logMel; } } // 5. Feature Standardization (per-instance mean/var normalization) for (let m = 0; m < n_mels; m++) { let sum = 0; for (let f = 0; f < numFrames; f++) { sum += melSpec[m * numFrames + f]; } const mean = sum / numFrames; let sumSq = 0; for (let f = 0; f < numFrames; f++) { const diff = melSpec[m * numFrames + f] - mean; sumSq += diff * diff; } const std = Math.sqrt(sumSq / numFrames) + 1e-9; for (let f = 0; f < numFrames; f++) { melSpec[m * numFrames + f] = (melSpec[m * numFrames + f] - mean) / std; } } return { melSpec, numFrames }; }; export default function App() { // App State const [modelUrl, setModelUrl] = useState("https://huggingface.co/sulabhkatiyar/indicconformer-120m-onnx/resolve/main/ml/model.onnx"); const [vocabUrl, setVocabUrl] = useState("https://huggingface.co/sulabhkatiyar/indicconformer-120m-onnx/resolve/main/ml/vocab.json"); const [isOrtReady, setIsOrtReady] = useState(false); const [session, setSession] = useState(null); const [vocab, setVocab] = useState([]); const [isLoading, setIsLoading] = useState(false); const [isRecording, setIsRecording] = useState(false); const [status, setStatus] = useState("Please load the model to begin."); const [transcript, setTranscript] = useState(""); const [copiedMessage, setCopiedMessage] = useState(""); const [showSettings, setShowSettings] = useState(false); const [errorMessage, setErrorMessage] = useState(""); // Refs for Audio Recording const mediaRecorderRef = useRef(null); const audioChunksRef = useRef([]); const fileInputRef = useRef(null); // Load onnxruntime-web script dynamically useEffect(() => { if (window.ort) { setIsOrtReady(true); return; } const script = document.createElement('script'); script.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"; script.async = true; script.onload = () => setIsOrtReady(true); script.onerror = () => setErrorMessage("Failed to load onnxruntime-web library."); document.body.appendChild(script); }, []); const loadVocab = async (url) => { const res = await fetch(url); if (!res.ok) throw new Error(`Failed to load vocab from ${url}`); try { // First attempt to parse as JSON const data = await res.json(); if (Array.isArray(data)) { return data; // Simple array of tokens } else if (typeof data === 'object') { // Handle format {"token": index} const vocabArray = []; for (const [token, index] of Object.entries(data)) { vocabArray[index] = token; } return vocabArray; } } catch (e) { // Fallback to text-based parsing if JSON fails (e.g. for vocab.txt) const text = await res.text(); return text.split('\n').map(line => line.trim()).filter(line => line.length > 0); } throw new Error("Invalid vocabulary format"); }; const initModel = async () => { if (!isOrtReady || !window.ort) { setErrorMessage("ONNX Runtime is not ready yet."); return; } setIsLoading(true); setErrorMessage(""); setStatus("Downloading Vocabulary..."); try { const loadedVocab = await loadVocab(vocabUrl); setVocab(loadedVocab); setStatus("Downloading ONNX Model (100MB+). This may take a while..."); // Create Inference Session using the WASM execution provider const sess = await window.ort.InferenceSession.create(modelUrl, { executionProviders: ['wasm'] }); setSession(sess); setStatus("Model Loaded & Ready. Press the microphone to speak."); } catch (err) { console.error(err); setErrorMessage(`Initialization Error: ${err.message}. Please check the URLs in Settings.`); setStatus("Failed to load model."); } finally { setIsLoading(false); } }; const startRecording = async () => { try { const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); const mediaRecorder = new MediaRecorder(stream); audioChunksRef.current = []; mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunksRef.current.push(e.data); }; mediaRecorder.onstop = processAndInfer; mediaRecorderRef.current = mediaRecorder; mediaRecorder.start(); setIsRecording(true); setStatus("Recording... Speak in Malayalam."); setErrorMessage(""); } catch (err) { console.error(err); setErrorMessage("Microphone permission denied or an error occurred."); } }; const stopRecording = () => { if (mediaRecorderRef.current && isRecording) { mediaRecorderRef.current.stop(); setIsRecording(false); // Stops all microphone tracks mediaRecorderRef.current.stream.getTracks().forEach(track => track.stop()); } }; const processAndInfer = async () => { setStatus("Processing Audio..."); try { // Decode audio and resample to 16kHz Mono Float32 const blob = new Blob(audioChunksRef.current); const arrayBuffer = await blob.arrayBuffer(); const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 }); const decodedData = await audioCtx.decodeAudioData(arrayBuffer); const float32Data = decodedData.getChannelData(0); // Mono channel setStatus("Running Inference..."); await runInference(float32Data); } catch (err) { console.error(err); setErrorMessage(`Audio Processing Error: ${err.message}`); setStatus("Ready."); } }; const handleFileUpload = async (e) => { const file = e.target.files[0]; if (!file) return; setStatus("Processing Uploaded Audio..."); setErrorMessage(""); setIsLoading(true); try { const arrayBuffer = await file.arrayBuffer(); const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 }); const decodedData = await audioCtx.decodeAudioData(arrayBuffer); const float32Data = decodedData.getChannelData(0); // Mono channel setStatus("Running Inference on File..."); await runInference(float32Data); } catch (err) { console.error(err); setErrorMessage(`Audio Upload Error: ${err.message}`); setStatus("Ready."); } finally { setIsLoading(false); e.target.value = null; // Reset input to allow re-uploading the same file } }; const runInference = async (float32Data) => { try { const inputNames = session.inputNames; const feeds = {}; // Attempt 1: Raw Waveform tensor if (inputNames.includes('audio_signal')) { feeds['audio_signal'] = new window.ort.Tensor('float32', float32Data, [1, float32Data.length]); } else { throw new Error(`The model expects inputs: ${inputNames.join(', ')}.`); } if (inputNames.includes('length')) { feeds['length'] = new window.ort.Tensor('int64', new BigInt64Array([BigInt(float32Data.length)]), [1]); } let results; try { results = await session.run(feeds); } catch (runError) { // Attempt 2: Feature-extracted Log-Mel Spectrogram (Catches "Expected: 3" or "Expected: 80" errors) if (runError.message && (runError.message.includes("Expected: 3") || runError.message.includes("Expected: 80"))) { console.warn("Raw audio tensor failed. Model likely lacks a feature extractor. Computing 80-bin Log-Mel Spectrogram natively..."); const { melSpec, numFrames } = computeLogMelSpectrogram(float32Data); if (numFrames <= 0) throw new Error("Audio sample is too short to process."); feeds['audio_signal'] = new window.ort.Tensor('float32', melSpec, [1, 80, numFrames]); if (inputNames.includes('length')) { feeds['length'] = new window.ort.Tensor('int64', new BigInt64Array([BigInt(numFrames)]), [1]); } results = await session.run(feeds); } else { throw runError; // Unhandled error } } // Assume the first output contains the logprobs/logits const outputName = session.outputNames[0]; const outputTensor = results[outputName]; const logits = outputTensor.data; let dims = outputTensor.dims; // Standardize dims to [batch, time, vocab] if (dims.length === 2) dims = [1, dims[0], dims[1]]; const text = decodeCTC(logits, dims, vocab); setTranscript(prev => prev + (prev ? " " : "") + text); setStatus("Transcription Complete. Ready for next."); } catch (err) { console.error(err); setErrorMessage(`Inference Error: ${err.message}`); setStatus("Ready."); } }; const decodeCTC = (logits, dims, vocabList) => { const T = dims[1]; // Time frames const V = dims[2]; // Vocab size emitted by model let result = []; let prev_id = -1; // In typical NeMo models, the blank token is the last index const blankId = V - 1; for (let t = 0; t < T; t++) { let max_val = -Infinity; let max_id = -1; for (let v = 0; v < V; v++) { const val = logits[t * V + v]; if (val > max_val) { max_val = val; max_id = v; } } if (max_id !== prev_id && max_id !== blankId) { let token = ""; if (max_id < vocabList.length) { token = vocabList[max_id]; } // Ignore standard special tokens if (token && token !== '' && token !== '' && token !== '' && token !== '') { result.push(token); } } prev_id = max_id; } // Clean up SentencePiece artifacts (e.g., '_' or ' ') let decodedText = result.join(''); decodedText = decodedText.replace(/ /g, ' ').replace(/_/g, ' ').trim(); return decodedText.replace(/\s+/g, ' '); // Remove redundant spaces }; const handleCopy = () => { const textArea = document.createElement("textarea"); textArea.value = transcript; document.body.appendChild(textArea); textArea.select(); try { document.execCommand('copy'); setCopiedMessage("Copied to clipboard!"); setTimeout(() => setCopiedMessage(""), 2000); } catch (err) { setCopiedMessage("Failed to copy"); setTimeout(() => setCopiedMessage(""), 2000); } document.body.removeChild(textArea); }; return (
{/* Header */}

Malayalam Speech-to-Text

Powered by IndicConformer-120M & ONNX Runtime Web

{/* Main Interface Card */}
{/* Status Bar */}
{isLoading ? ( ) : session ? ( ) : ( )} {status}
{/* Settings Panel */} {showSettings && (

Model Configuration

setModelUrl(e.target.value)} className="w-full p-2.5 border border-neutral-300 dark:border-neutral-600 rounded-lg bg-white dark:bg-neutral-900 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition-all" />
setVocabUrl(e.target.value)} className="w-full p-2.5 border border-neutral-300 dark:border-neutral-600 rounded-lg bg-white dark:bg-neutral-900 focus:ring-2 focus:ring-blue-500 focus:border-blue-500 outline-none transition-all" />
Re-initialize model after changing URLs.
)} {/* Action Area */}
{/* Error Message Display */} {errorMessage && (
{errorMessage}
)} {!session && !isLoading && !errorMessage && ( )} {/* Input Controls */}
{/* Microphone Button */} {/* Upload Button */}

{isRecording ? "Tap to Stop & Transcribe" : (session ? "Tap Mic to Record or Upload an Audio File" : "Model required to process audio")}

{/* Transcript Area */}

Transcript

{/* Copy Tools */}
{copiedMessage && {copiedMessage}}
{transcript || Transcription will appear here...}
); }