import gradio as gr
import whisper
from pytube import YouTube
import requests
headers = {
    'accept': 'application/json',
    'x-gladia-key': '89b0adf5-fb2c-48ba-8a66-76b02827fd14',
    # requests won't add a boundary if this header is set when you pass files=
    # 'Content-Type': 'multipart/form-data',
}

# The Gladia endpoint takes a multipart request with an 'audio' file part (or an
# 'audio_url' part pointing at a remote file) plus 'language' and 'language_behaviour'
# fields; get_transcript() below builds and sends that request for the downloaded audio.

def get_audio(url):
    # Download the audio-only stream of the YouTube video to tmp.mp4 and return its path.
    print(f'{url} start get audio ...')
    yt = YouTube(url)
    audio_file = yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
    print('audio over ...')
    return audio_file

def get_transcript(url, model_size, lang, format):
    # Gladia API path: upload the downloaded audio and return the raw JSON response.
    # model_size, lang and format are accepted only to match get_transcript2's signature.
    audio_file = get_audio(url)
    files = {
        'audio': (audio_file, open(audio_file, 'rb'), 'video/mp4'),
        # An 'audio_url' part pointing at Gladia's sample clip is left out here so the
        # uploaded file is the one that gets transcribed.
        'language': (None, 'english'),
        'language_behaviour': (None, 'automatic single language'),
    }
    response = requests.post('https://api.gladia.io/audio/text/audio-transcription/', headers=headers, files=files)
    return response.text
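
# A minimal sketch of pulling plain text out of the Gladia JSON above. The field names
# ('prediction', 'transcription') are assumptions about the response schema and should be
# checked against a real response before relying on them:
#
#   data = response.json()
#   text = " ".join(seg.get("transcription", "") for seg in data.get("prediction", []))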

def get_transcript2(url, model_size, lang, format):
    # Local Whisper path: load the selected model and transcribe the downloaded audio.
    print('whisper loading ...')
    model = whisper.load_model(model_size)
    print('whisper over')
    if lang == "None":
        lang = None
    result = model.transcribe(get_audio(url), fp16=False, language=lang)
    if format == "None":
        return result["text"]
    elif format == ".srt":
        return format_to_srt(result["segments"])
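
# Example call for the local-Whisper path (hypothetical video URL), e.g. from a Python
# shell, producing SRT-formatted output with the smallest model:
#
#   print(get_transcript2("https://www.youtube.com/watch?v=XXXXXXXXXXX", "tiny", "None", ".srt"))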

def format_to_srt(segments):
    output = ""
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
        output += f"{segment['text']}\n\n"
    return output

def format_timestamp(t):
    # Convert a time in seconds into an SRT timestamp, HH:MM:SS,mmm.
    hh = t // 3600
    mm = (t - hh * 3600) // 60
    ss = t - hh * 3600 - mm * 60
    mi = (t - int(t)) * 1000
    return f"{int(hh):02d}:{int(mm):02d}:{int(ss):02d},{int(mi):03d}"
langs = ["None"] + sorted(list(whisper.tokenizer.LANGUAGES.values())) | |
model_size = list(whisper._MODELS.keys()) | |

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                url = gr.Textbox(placeholder='Youtube video URL', label='URL')
            with gr.Row():
                model_size = gr.Dropdown(choices=model_sizes, value='tiny', label="Model")
                lang = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                format = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")
            with gr.Row():
                gr.Markdown("Larger models are more accurate, but slower. For a 1 min video, it'll take ~30 s (tiny), ~1 min (base), ~3 min (small), ~5 min (medium), etc.")
                transcribe_btn = gr.Button('Transcribe')
        with gr.Column():
            outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')
    # The button is wired to the Gladia-based get_transcript; swap in get_transcript2
    # (same signature) to transcribe locally with Whisper instead.
    transcribe_btn.click(get_transcript, inputs=[url, model_size, lang, format], outputs=outputs)
demo.launch(debug=True)