Spaces:

rafat0421
/

Whisper

Runtime error

App Files Files Community

Whisper / app.py

rafat0421

Update app.py

d1a43a0 almost 2 years ago

raw

history blame contribute delete

3.44 kB

	import os
	import gradio as gr
	from transformers import pipeline
	from pytube import YouTube
	from datasets import Dataset, Audio
	from moviepy.editor import AudioFileClip
	import googletrans
	from googletrans import Translator

	# Build the Hugging Face pipeline once at import time so every request
	# handled by transcribe() reuses the same loaded model.
	# NOTE(review): no task is passed, so it is inferred from the model repo;
	# presumably automatic speech recognition -- confirm.
	pipe = pipeline(model="rafat0421/whisper-small-hi")

	def download_from_youtube(url):
	    """Download the audio track of a YouTube video and return its local path.

	    Args:
	        url: Address of the YouTube video.

	    Returns:
	        The path returned by pytube's ``Stream.download()``.

	    Raises:
	        ValueError: If the video offers no audio-only MP4 stream.
	    """
	    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
	    stream = audio_streams.first()
	    # first() yields None for an empty query; fail with a clear message
	    # rather than an AttributeError on None.
	    if stream is None:
	        raise ValueError(f"No audio-only MP4 stream available for {url!r}")
	    return stream.download()

	def get_timestamp(seconds):
	    """Format a duration given in seconds as a zero-padded ``MM:SS`` string.

	    Fractions of a second are truncated. Minutes are not wrapped at 60, so
	    durations of an hour or more produce a minutes field wider than two digits.
	    """
	    whole_minutes = int(seconds / 60)
	    leftover_seconds = int(seconds % 60)
	    return "{:02d}:{:02d}".format(whole_minutes, leftover_seconds)

	def create_segments(audio_fpath, seconds_max, segment_length=30):
	    """Split an audio file into consecutive WAV segments on disk.

	    Segments are written to ``segmented_audios/segment_<i>.wav``. At most
	    ``int(seconds_max / segment_length)`` segments are produced. A trailing
	    partial segment is kept only if it is longer than two seconds.

	    Args:
	        audio_fpath: Path of the audio file to split.
	        seconds_max: Upper bound, in seconds, on the audio to export.
	        segment_length: Length of each full segment in seconds (default 30).

	    Returns:
	        A tuple ``(segment_paths, segment_start_times)`` of parallel lists:
	        the written file paths and each segment's start offset in seconds.

	    Raises:
	        ValueError: If ``segment_length`` is not positive.
	    """
	    if segment_length <= 0:
	        raise ValueError("segment_length must be positive")

	    os.makedirs("segmented_audios", exist_ok=True)

	    segment_paths = []
	    segment_start_times = []

	    sound = AudioFileClip(audio_fpath)
	    try:
	        bounds = _segment_bounds(sound.duration, seconds_max, segment_length)
	        for i, (start, end) in enumerate(bounds):
	            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
	            sound.subclip(start, end).write_audiofile(segment_path)
	            segment_paths.append(segment_path)
	            segment_start_times.append(start)
	    finally:
	        # Release the underlying reader even if writing a segment fails.
	        sound.close()

	    return segment_paths, segment_start_times


	def _segment_bounds(duration, seconds_max, segment_length):
	    """Return the ``(start, end)`` second offsets of each segment to export."""
	    max_segments = int(seconds_max / segment_length)
	    n_full_segments = min(int(duration / segment_length), max_segments)
	    bounds = [
	        (i * segment_length, (i + 1) * segment_length)
	        for i in range(n_full_segments)
	    ]

	    # The trailing partial segment is only considered while the cap has not
	    # been reached, and is skipped when it is two seconds or shorter.
	    len_last_segment = duration % segment_length
	    if n_full_segments < max_segments and len_last_segment > 2:
	        last_start = n_full_segments * segment_length
	        bounds.append((last_start, last_start + len_last_segment))
	    return bounds

	def get_translation(text):
	    """Return the translation of ``text``.

	    Translation is not implemented yet: the argument is ignored and a fixed
	    placeholder string is returned.
	    """
	    # TODO: translate with googletrans, e.g.
	    # Translator().translate(text, src=..., dest=...), once that path works.
	    placeholder = "Under Development..."
	    return placeholder

	def transcribe(audio, url, seconds_max):
	    """Transcribe a YouTube video's audio or a recorded audio file.

	    A non-blank ``url`` takes precedence over ``audio``. For a URL, the audio
	    is downloaded, split into segments, and each segment is reported with its
	    index, start time, transcription and translation. Otherwise ``audio`` is
	    passed to the pipeline as a whole.

	    Args:
	        audio: Recording to transcribe when no URL is given.
	        url: YouTube link; ``None``, empty or whitespace-only means "not given".
	        seconds_max: Upper bound, in seconds, on the YouTube audio to transcribe.

	    Returns:
	        The transcription text, or a short prompt when neither input was given.
	    """
	    if isinstance(url, str):
	        url = url.strip()

	    if not url:
	        if not audio:
	            # Nothing to transcribe; say so instead of calling the pipeline
	            # with no input.
	            return "Please provide a microphone recording or a YouTube URL."
	        return pipe(audio)["text"]

	    fpath = download_from_youtube(url)
	    segment_paths, segment_start_times = create_segments(fpath, seconds_max)
	    if not segment_paths:
	        # No segment was exported, so there is nothing to run the model on.
	        return ""

	    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
	    pred = pipe(audio_dataset["audio"])

	    n_segments = len(segment_start_times)
	    parts = []
	    for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
	        parts.append(
	            f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
	            f"{output['text']}\n"
	            f"[Translation]\n{get_translation(output['text'])}\n\n"
	        )
	    return "".join(parts)

	# Input widgets, listed in the positional order of
	# transcribe(audio, url, seconds_max).
	_input_components = [
	    gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
	    gr.Text(max_lines=1, placeholder="YouTube Link", label="Transcribe from YouTube URL"),
	    gr.Slider(minimum=30, maximum=600, value=30, step=30, label="Number of seconds to transcribe"),
	]

	# Web UI: the three inputs above go to transcribe(); its result is shown as text.
	iface = gr.Interface(
	    title="Whisper: transcribe Swedish language audio to text",
	    description="Swedish Text Transcription using Transformers.",
	    fn=transcribe,
	    inputs=_input_components,
	    outputs="text",
	)

	# Start serving the interface.
	iface.launch()