# Whisper / app.py
# rafat0421: Update app.py (d1a43a0)
import os
import gradio as gr
from transformers import pipeline
from pytube import YouTube
from datasets import Dataset, Audio
from moviepy.editor import AudioFileClip
import googletrans
from googletrans import Translator
# Build the Hugging Face pipeline once at import time so every request reuses
# the same loaded model. No task is passed explicitly; only the checkpoint
# name is given.
pipe = pipeline(model="rafat0421/whisper-small-hi")
def download_from_youtube(url):
    """Download the audio track of a YouTube video.

    Args:
        url: Address of the YouTube video.

    Returns:
        Path of the downloaded file, as reported by the stream's
        ``download()`` call.

    Raises:
        ValueError: If the video offers no audio-only MP4 stream.
    """
    audio_streams = YouTube(url).streams.filter(only_audio=True, file_extension="mp4")
    stream = audio_streams.first()
    # first() yields None for an empty query; fail with a clear message
    # instead of an AttributeError on the download call.
    if stream is None:
        raise ValueError(f"No audio-only MP4 stream found for {url!r}")
    return stream.download()
def get_timestamp(seconds):
    """Format a duration given in seconds as a zero-padded ``MM:SS`` string.

    Fractions of a second are truncated, and the minute count is not wrapped
    at 60 (3725 seconds gives ``"62:05"``).
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"
def _segment_bounds(duration, seconds_max, segment_len=30, min_tail=2):
    """Return ``(index, start, end)`` triples for the segments to export.

    Full ``segment_len``-second segments come first, capped at
    ``int(seconds_max / segment_len)``. A final partial segment is appended
    only when the cap has not been reached and the remainder is longer than
    ``min_tail`` seconds.
    """
    n_full_segments = int(duration / segment_len)
    len_last_segment = duration % segment_len
    max_segments = int(seconds_max / segment_len)

    bounds = [
        (i, i * segment_len, (i + 1) * segment_len)
        for i in range(min(n_full_segments, max_segments))
    ]
    if n_full_segments < max_segments and len_last_segment > min_tail:
        start = n_full_segments * segment_len
        bounds.append((n_full_segments, start, start + len_last_segment))
    return bounds


def create_segments(audio_fpath, seconds_max):
    """Split an audio file into WAV segments of at most 30 seconds.

    Segments are written to ``segmented_audios/segment_<i>.wav``, overwriting
    files of the same name left by an earlier call.

    Args:
        audio_fpath: Path of the audio file to split.
        seconds_max: Upper bound, in seconds, on the amount of audio to
            export; it is truncated to a whole number of 30-second segments.

    Returns:
        A ``(segment_paths, segment_start_times)`` tuple of parallel lists:
        the path of each written file and its start offset in seconds. A
        trailing remainder of two seconds or less is skipped.
    """
    output_dir = "segmented_audios"
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(output_dir, exist_ok=True)

    segment_paths = []
    segment_start_times = []
    sound = AudioFileClip(audio_fpath)
    try:
        for i, start, end in _segment_bounds(sound.duration, seconds_max):
            segment_path = os.path.join(output_dir, f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the underlying audio reader even if exporting a segment fails.
        sound.close()
    return segment_paths, segment_start_times
def get_translation(text):
    """Return the translation of *text*.

    Translation is not implemented yet: the argument is ignored and a fixed
    placeholder message is returned.
    """
    # TODO: wire in a real translation backend.
    placeholder = "Under Development..."
    return placeholder
def transcribe(audio, url, seconds_max):
    """Transcribe a YouTube video or a recorded audio file.

    A non-blank ``url`` takes precedence over ``audio``.

    Args:
        audio: Path of the recorded audio file, or ``None`` when nothing was
            recorded.
        url: YouTube address; ``None``, empty or whitespace-only means "not
            provided".
        seconds_max: Maximum number of seconds of the video to transcribe
            (only used for the YouTube path).

    Returns:
        For a URL: one entry per segment with its index, start time,
        transcription and translation. For a recording: the transcribed
        text. If neither input is usable, or no segment could be extracted
        from the video, a short explanatory message.
    """
    # Treat a whitespace-only field like an empty one so it is not sent to pytube.
    url = (url or "").strip()
    if url:
        fpath = download_from_youtube(url)
        segment_paths, segment_start_times = create_segments(fpath, seconds_max)
        if not segment_paths:
            return "The video is too short to transcribe."

        audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column(
            "audio", Audio(sampling_rate=16000)
        )
        pred = pipe(audio_dataset["audio"])

        n_segments = len(segment_start_times)
        entries = []
        for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
            entries.append(
                f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
                f"{output['text']}\n"
                f"[Translation]\n{get_translation(output['text'])}\n\n"
            )
        return "".join(entries)

    # Neither a URL nor a recording: report it instead of crashing in the pipeline.
    if not audio:
        return "Please record audio with the microphone or enter a YouTube URL."
    return pipe(audio)["text"]
# Input widgets, listed in the positional order of transcribe(audio, url, seconds_max).
_microphone_input = gr.Audio(
    source="microphone",
    type="filepath",
    label="Transcribe from Microphone",
)
_youtube_url_input = gr.Text(
    max_lines=1,
    placeholder="YouTube Link",
    label="Transcribe from YouTube URL",
)
_duration_input = gr.Slider(
    minimum=30,
    maximum=600,
    value=30,
    step=30,
    label="Number of seconds to transcribe",
)

# Assemble the web UI around transcribe() and start serving it.
iface = gr.Interface(
    fn=transcribe,
    inputs=[_microphone_input, _youtube_url_input, _duration_input],
    outputs="text",
    title="Whisper: transcribe Swedish language audio to text",
    description="Swedish Text Transcription using Transformers.",
)
iface.launch()