| import os |
| import gradio as gr |
| import yt_dlp |
| from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq |
| import torch |
| import torchaudio |
| from pydub import AudioSegment |
| from pydub.silence import split_on_silence |
|
|
|
|
def download_video(url):
    """Fetch the best available audio stream for a video and store it as WAV.

    The file is always written to the current working directory under the
    fixed stem ``downloaded_audio``, so a later call overwrites the result
    of an earlier one.

    :param url: The URL of the video to download.
    :return: Path to the extracted WAV file.
    """
    output_stem = "downloaded_audio"

    # Ask yt-dlp to hand the download to its FFmpeg audio-extraction
    # post-processor, which converts the stream to WAV.
    extract_wav = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_wav],
        # yt-dlp substitutes %(ext)s itself; it is not a Python format field.
        'outtmpl': output_stem + ".%(ext)s",
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

    return output_stem + ".wav"
|
|
|
|
def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """
    Cut an audio file at silent passages and write each piece to disk.

    Pieces longer than ``max_length`` are further divided into consecutive
    slices. Every resulting piece is exported as ``chunk<N>.wav`` in the
    current working directory, numbered from 0 in playback order.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dB).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the audio chunks.
    """
    recording = AudioSegment.from_file(audio_file)

    voiced_parts = split_on_silence(
        recording,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

    # Enforce the length cap: anything too long is cut into fixed-size
    # slices. This relies on pydub's stepped slicing (segment[::step]),
    # which iterates over consecutive sub-segments of `step` milliseconds.
    pieces = []
    for part in voiced_parts:
        if len(part) > max_length:
            pieces.extend(part[::max_length])
        else:
            pieces.append(part)

    # Write every piece out under a sequential file name.
    paths = [f"chunk{index}.wav" for index in range(len(pieces))]
    for path, piece in zip(paths, pieces):
        piece.export(path, format="wav")

    return paths
|
|
|
|
| |
| |
# Checkpoint identifier shared by the processor and the model, named once so
# the two cannot drift apart.
_WHISPER_CHECKPOINT = "GroupSix/whisper-small-sv"

# Both objects are created once at import time and reused by every
# transcription call in this module.
processor = AutoProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_WHISPER_CHECKPOINT)
|
|
|
|
def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the module-level Whisper model.

    The chunk is loaded, downmixed to one channel, resampled to 16 kHz if
    necessary, and passed through ``processor`` and ``model``.

    :param segment: Path to the audio chunk to transcribe.
    :param num_segments: Total number of chunks; used only in the progress
        message printed to stdout.
    :return: The decoded transcription, or an empty string if decoding
        produced no output.
    """
    target_rate = 16000

    print(f"Current segment: {segment} (out of {num_segments})")

    # torchaudio returns the waveform as (channels, frames) by default.
    waveform, sample_rate = torchaudio.load(segment)

    # Downmix to mono. Merely squeezing leaves a stereo file as a 2-D array,
    # which the feature extractor treats as a batch of two separate clips:
    # generation cost doubles and only the first channel's text is returned.
    # For single-channel input the mean over dim 0 equals the old squeeze.
    mono = waveform.mean(dim=0)

    # The processor is called with sampling_rate=16000, so the samples must
    # actually be at that rate.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        mono = resampler(mono)

    inputs = processor(mono.numpy(), sampling_rate=target_rate, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded_output[0] if decoded_output else ""
|
|
|
|
| |
def process_video(url):
    """Download a video's audio, split it into chunks, and transcribe each one.

    The temporary files created along the way (the downloaded WAV and the
    per-chunk WAVs) are removed before returning, including when splitting
    or transcription raises.

    :param url: The URL of the video to transcribe.
    :return: List of transcription strings, one per audio chunk, in order.
    """
    audio_file = download_video(url)
    segments = []
    try:
        segments = split_audio(audio_file)
        total = len(segments)
        return [transcribe_audio(segment, total) for segment in segments]
    finally:
        # Clean up on success and on failure so a crashed request does not
        # leave audio files behind for the next one.
        for path in (audio_file, *segments):
            try:
                os.remove(path)
            except FileNotFoundError:
                # Deliberate best-effort: a file that is already gone must
                # not mask the exception (if any) that brought us here.
                pass
|
|
|
|
| |
# Web UI: one text box for the video URL in, one text box for the result out.
_url_box = gr.Textbox(label="Swedish YouTube Video URL")
_transcript_box = gr.Textbox(label="Transcriptions")

iface = gr.Interface(
    fn=process_video,
    inputs=_url_box,
    outputs=_transcript_box,
)

# Called at module level, so the app starts as soon as this file is executed.
iface.launch()
|
|