Spaces:

GroupSix
/

language

Sleeping

App Files Files Community

language / app.py

martenb

Updated app at sön 17 dec 2023 13:16:29 CET

be31921 over 2 years ago

raw

history blame contribute delete

4.34 kB

	import os
	import gradio as gr
	import yt_dlp
	from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
	import torch
	import torchaudio
	from pydub import AudioSegment
	from pydub.silence import split_on_silence


	def download_video(url):
	    """Download a video's audio track and return the path of the WAV file.

	    The audio is fetched through yt_dlp using the ``bestaudio/best`` format
	    and handed to the ``FFmpegExtractAudio`` post-processor with ``wav`` as
	    the preferred codec. The output template uses a fixed base name, so
	    every call targets the same path in the current working directory.

	    :param url: The URL of the video to download.
	    :return: Path to the downloaded audio file (``downloaded_audio.wav``).
	    :raises ValueError: If ``url`` is empty or contains only whitespace.
	    """
	    # Fail fast with a clear message instead of handing a blank URL to the
	    # downloader.
	    if not url or not url.strip():
	        raise ValueError("A video URL is required.")

	    # Single source of truth for the output name, so the template given to
	    # yt_dlp and the path returned to the caller cannot drift apart.
	    base_name = "downloaded_audio"

	    ydl_opts = {
	        'format': 'bestaudio/best',
	        'postprocessors': [{
	            'key': 'FFmpegExtractAudio',
	            'preferredcodec': 'wav',
	            'preferredquality': '192',
	        }],
	        # "%(ext)s" is a yt_dlp output-template field, not Python formatting.
	        'outtmpl': f"{base_name}.%(ext)s",
	    }

	    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	        ydl.download([url])

	    return f"{base_name}.wav"


	def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
	    """
	    Split an audio file into chunks at points of silence and export them.

	    Chunks longer than ``max_length`` are cut further into consecutive
	    pieces. Every resulting chunk is written to the current working
	    directory as ``chunk<i>.wav``, numbered from 0 in playback order.

	    :param audio_file: Path to the audio file.
	    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
	    :param silence_thresh: Silence threshold (in dB).
	    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
	    :param max_length: Maximum length of each chunk (in ms). Must be positive.
	    :return: List of paths to the exported audio chunks.
	    :raises ValueError: If ``max_length`` is not positive.
	    """
	    # A zero or negative step makes the stepped slice below meaningless, so
	    # reject it before any file is loaded.
	    if max_length <= 0:
	        raise ValueError(
	            f"max_length must be a positive number of milliseconds, got {max_length}"
	        )

	    # Load the audio file
	    audio = AudioSegment.from_file(audio_file)

	    # Split the audio file into chunks at points of silence
	    chunks = split_on_silence(
	        audio,
	        min_silence_len=min_silence_len,
	        silence_thresh=silence_thresh,
	        keep_silence=keep_silence,
	    )

	    # Further split chunks if they are too long
	    split_chunks = []
	    for chunk in chunks:
	        if len(chunk) <= max_length:
	            split_chunks.append(chunk)
	        else:
	            # Slicing an AudioSegment with a step is relied on here to yield
	            # consecutive pieces of at most max_length ms (pydub behaviour).
	            split_chunks.extend(chunk[::max_length])

	    # Export the chunks to files
	    chunk_filenames = []
	    for i, chunk in enumerate(split_chunks):
	        chunk_name = f"chunk{i}.wav"
	        chunk.export(chunk_name, format="wav")
	        chunk_filenames.append(chunk_name)

	    return chunk_filenames


	# Hugging Face
	# The processor and the model are loaded once, at import time, from the same
	# checkpoint; the id is kept in one place so the two cannot diverge.
	MODEL_ID = "GroupSix/whisper-small-sv"
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID)


	def transcribe_audio(segment, num_segments):
	    """Transcribe one audio chunk with the module-level processor and model.

	    The chunk is loaded with torchaudio, down-mixed to a single channel,
	    resampled to 16 kHz when its sample rate differs, and then passed
	    through ``processor`` and ``model.generate``.

	    :param segment: Path to the audio chunk to transcribe.
	    :param num_segments: Total number of chunks; only used in the progress message.
	    :return: The decoded transcription, or an empty string if nothing was decoded.
	    """
	    print(f"Current segment: {segment} (out of {num_segments})")

	    # Load the audio file; torchaudio returns a (channels, samples) tensor.
	    waveform, sample_rate = torchaudio.load(segment)

	    # Down-mix multi-channel audio to mono. Without this a stereo clip stays
	    # two-dimensional and the feature extractor treats each channel as a
	    # separate batch item, doubling the work and using only channel 0.
	    if waveform.size(0) > 1:
	        waveform = waveform.mean(dim=0, keepdim=True)

	    # Resample if necessary
	    if sample_rate != 16000:
	        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
	        waveform = resampler(waveform)

	    # Drop only the channel axis so the processor receives a 1-D signal.
	    inputs = processor(waveform.squeeze(0).numpy(), sampling_rate=16000, return_tensors="pt")

	    # Generate the transcription; no gradients are needed for inference.
	    with torch.no_grad():
	        generated_ids = model.generate(**inputs)

	    # Decode the output and select the first transcription
	    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
	    return decoded_output[0] if decoded_output else ""


	# Main function to tie everything together
	def process_video(url):
	    """Download a video, split its audio on silence and transcribe each chunk.

	    The downloaded audio file and the exported chunk files are removed
	    before returning, including when splitting or transcription raises.

	    :param url: The URL of the video to transcribe.
	    :return: List with one transcription string per audio chunk, in order.
	    """
	    # Download the audio; if this raises there is no known path to clean up.
	    audio_file = download_video(url)

	    segments = []
	    try:
	        # Split the audio, then transcribe each segment in order.
	        segments = split_audio(audio_file)
	        total = len(segments)
	        return [transcribe_audio(segment, total) for segment in segments]
	    finally:
	        # Always delete the audio file and the chunks so a failed run does
	        # not leave temporary files behind.
	        for path in [audio_file, *segments]:
	            try:
	                os.remove(path)
	            except FileNotFoundError:
	                # Already gone; cleanup is best-effort and must not mask
	                # an exception raised by the work above.
	                pass


	# Gradio interface: one textbox takes the video URL, another shows the
	# value returned by process_video.
	url_input = gr.Textbox(label="Swedish YouTube Video URL")
	transcript_output = gr.Textbox(label="Transcriptions")

	# Sample videos kept for reference (the examples list is currently disabled):
	#   https://www.youtube.com/watch?v=hcxwTgEC7IM  - Fred på jorden
	#   https://www.youtube.com/watch?v=AzlipxrzMe4  - Jerry talar spanska
	#   https://www.youtube.com/watch?v=H_16_5kGh3I  - Det heter näsa, inte nos!
	#   https://www.youtube.com/watch?v=v2m4V6FUseQ  - Ove blir arg på pantsystemet
	#   https://www.youtube.com/watch?v=oA5QJHBNQkU  - Hur mår björnen egentligen? (takes too long)
	iface = gr.Interface(
	    fn=process_video,
	    inputs=url_input,
	    outputs=transcript_output,
	)

	# Start the web app as soon as the module is executed.
	iface.launch()