# Hugging Face Space (status: Running)
import gradio as gr | |
import yt_dlp | |
from dotenv import load_dotenv | |
import os | |
import google.generativeai as genai | |
import re | |
import torch | |
from transformers import pipeline | |
from transformers.pipelines.audio_utils import ffmpeg_read | |
import time | |
import spaces | |
# Load variables from a local .env file into the process environment.
load_dotenv()

# Gemini API key read from the environment; transcribe() falls back to it
# when the user leaves the key field empty.
default_gemini_api_key = os.environ.get('gemini_api_key')

# Device selector handed to the transformers pipeline: CUDA device 0 when
# CUDA is available, otherwise the string "cpu".
device = "cpu" if not torch.cuda.is_available() else 0
def load_pipeline(model_name):
    """Create an automatic-speech-recognition pipeline for ``model_name``.

    Args:
        model_name: Model identifier forwarded to ``transformers.pipeline``.

    Returns:
        The pipeline object, configured for 30-second chunks and placed on
        the module-level ``device``.
    """
    asr_options = {
        "task": "automatic-speech-recognition",
        "model": model_name,
        "chunk_length_s": 30,
        "device": device,
    }
    return pipeline(**asr_options)
def configure_genai(api_key, model_variant):
    """Configure the Gemini client and return a model handle.

    Args:
        api_key: API key passed to ``genai.configure``.
        model_variant: Model name passed to ``genai.GenerativeModel``.

    Returns:
        The ``genai.GenerativeModel`` instance for ``model_variant``.
    """
    genai.configure(api_key=api_key)
    gemini_model = genai.GenerativeModel(model_variant)
    return gemini_model
# An ID is exactly 11 characters from [0-9A-Za-z_-], introduced by "v=" or "/".
# The negative lookahead rejects longer path segments (e.g. "/subscriptions"),
# whose first 11 characters would otherwise be mistaken for a video ID.
_YOUTUBE_ID_RE = re.compile(r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])')


def extract_youtube_id(youtube_url):
    """Extract the 11-character YouTube video ID from a URL.

    Handles the common URL shapes where the ID follows ``v=`` or a ``/``
    (``watch?v=ID``, ``youtu.be/ID``, ``/embed/ID``, ``/shorts/ID``).

    Args:
        youtube_url: The URL to inspect. May be empty or ``None``.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    if not youtube_url:
        return None
    youtube_id_match = _YOUTUBE_ID_RE.search(youtube_url)
    if youtube_id_match:
        return youtube_id_match.group(1)
    return None
def download_youtube_audio(youtube_url, output_filename):
    """Download the audio track of a YouTube video as MP3 via yt-dlp.

    Args:
        youtube_url: URL of the video to download.
        output_filename: Value used as yt-dlp's ``outtmpl``.

    Returns:
        ``output_filename`` exactly as passed in (no extension is added here).

    Raises:
        gr.Error: If yt-dlp raises any exception during the download.
    """
    # Convert whatever audio stream is fetched to 192 kbps MP3.
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    downloader_options = {
        'format': 'bestaudio/best',
        'postprocessors': [mp3_extractor],
        'outtmpl': output_filename,
    }
    try:
        with yt_dlp.YoutubeDL(downloader_options) as downloader:
            downloader.download([youtube_url])
    except Exception as e:
        print(f"Error downloading YouTube audio: {str(e)}")
        raise gr.Error(f"Failed to download YouTube audio: {str(e)}")
    else:
        print(f"Downloaded audio from YouTube URL: {youtube_url}")
        return output_filename
def summarize_transcription(transcription, model, gemini_prompt):
    """Ask the generative model to summarize a transcription.

    Args:
        transcription: Text to summarize.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        gemini_prompt: Instruction placed before the transcription.

    Returns:
        The model's response text. On any exception the error is printed and
        an ``"Error summarizing transcription: ..."`` string is returned
        instead of raising.
    """
    try:
        request_text = f"{gemini_prompt}:\n\n{transcription}"
        return model.generate_content(request_text).text
    except Exception as e:
        failure_message = f"Error summarizing transcription: {str(e)}"
        print(failure_message)
        return failure_message
# Model loaded on GPU when the caller does not choose one.
_DEFAULT_WHISPER_MODEL = "distil-whisper/distil-large-v3"
# Model always used when no CUDA device is available.
_CPU_WHISPER_MODEL = "openai/whisper-tiny"


def process_audio(audio_file, language, whisper_model=None):
    """Transcribe an audio file with a Whisper speech-recognition pipeline.

    Args:
        audio_file: Path of the audio file to transcribe.
        language: Language code forwarded to generation. When falsy, no
            language is passed to the pipeline.
        whisper_model: Model name to load when running on GPU. Ignored on
            CPU, where the tiny model is always used. ``None`` selects
            ``_DEFAULT_WHISPER_MODEL``.

    Returns:
        The transcribed text.
    """
    print("Starting transcription...")

    # The pipeline must exist before decoding: ffmpeg_read needs the
    # feature extractor's sampling rate.
    if device == 0:
        model_name = whisper_model or _DEFAULT_WHISPER_MODEL
    else:
        model_name = _CPU_WHISPER_MODEL
    pipe = load_pipeline(model_name)

    sampling_rate = pipe.feature_extractor.sampling_rate
    with open(audio_file, "rb") as f:
        raw_audio = f.read()
    inputs = {
        "array": ffmpeg_read(raw_audio, sampling_rate),
        "sampling_rate": sampling_rate,
    }

    generate_kwargs = {"task": "transcribe"}
    if language:
        print(f"Using language: {language}")
        generate_kwargs["language"] = language
    else:
        print("No language defined, using default language")

    result = pipe(
        inputs,
        batch_size=8,
        generate_kwargs=generate_kwargs,
        return_timestamps=True,
    )
    return result["text"]


def transcribe(youtube_url, audio_file, whisper_model, gemini_api_key, gemini_prompt, gemini_model_variant, language, progress=gr.Progress()):
    """Transcribe a YouTube video or uploaded audio file and summarize it.

    Args:
        youtube_url: Video URL; when truthy it takes precedence over
            ``audio_file``.
        audio_file: Uploaded file, either an object with a ``name``
            attribute or a plain path string.
        whisper_model: Whisper model name forwarded to ``process_audio``.
        gemini_api_key: Gemini key; falls back to ``default_gemini_api_key``
            when empty.
        gemini_prompt: Instruction placed before the transcription.
        gemini_model_variant: Gemini model name.
        language: Optional transcription language code.
        progress: Gradio progress callback.

    Returns:
        A 5-tuple: transcription text, summary text, path of the
        transcription file, path of the summary file, and elapsed seconds
        rounded to two decimals.

    Raises:
        gr.Error: When no input is provided or any processing step fails.
    """
    start_time = time.time()
    # Path of a file this call downloaded itself; only that file is deleted.
    downloaded_audio = None
    try:
        progress(0, desc="Initializing")
        if not gemini_api_key:
            gemini_api_key = default_gemini_api_key
        model = configure_genai(gemini_api_key, gemini_model_variant)

        if youtube_url:
            progress(0.1, desc="Extracting YouTube ID")
            youtube_id = extract_youtube_id(youtube_url)
            output_filename = youtube_id if youtube_id else "unknown"
            progress(0.2, desc="Downloading YouTube audio")
            base_name = download_youtube_audio(youtube_url, output_filename)
            # The MP3 postprocessor writes "<outtmpl>.mp3".
            audio_path = f"{base_name}.mp3"
            downloaded_audio = audio_path
            print(f"Audio file downloaded: {audio_path}")
        elif audio_file is not None:
            progress(0.2, desc="Reading audio file")
            # Accept both a file-like object with .name and a plain path.
            audio_path = str(getattr(audio_file, "name", audio_file))
            print(f"Audio file read: {audio_path}")
        else:
            raise gr.Error("Please provide a YouTube URL or upload an audio file.")

        progress(0.4, desc="Starting transcription")
        try:
            transcription = process_audio(audio_path, language, whisper_model)
        finally:
            # Remove the download even when transcription fails; uploaded
            # files are left alone.
            progress(0.6, desc="Cleaning up")
            if downloaded_audio and os.path.exists(downloaded_audio):
                os.remove(downloaded_audio)
                print(f"Deleted audio file: {downloaded_audio}")

        progress(0.7, desc="Summarizing transcription")
        summary = summarize_transcription(transcription, model, gemini_prompt)

        progress(0.8, desc="Preparing output")
        transcription_message = f"{transcription}" if transcription else ""
        summary_message = f"{summary}" if summary else ""

        progress(0.9, desc="Saving output to file")
        print("Saving transcription and summary to file...")
        # Transcription and summary go to separate text files for download.
        transcription_file = "transcription_output.txt"
        summary_file = "summary_output.txt"
        with open(transcription_file, "w", encoding="utf-8") as f:
            f.write(transcription_message)
        with open(summary_file, "w", encoding="utf-8") as f:
            f.write(summary_message)

        progress(1, desc="Complete")
        print("Transcription and summarization complete.")
        total_time = round(time.time() - start_time, 2)
        return transcription_message, summary_message, transcription_file, summary_file, total_time
    except gr.Error:
        # Gradio errors already carry a user-facing message.
        raise
    except Exception as e:
        print(f"Error during transcription or summarization: {str(e)}")
        raise gr.Error(f"Transcription or summarization failed: {str(e)}") from e
def toggle_input(choice):
    """Show the input widget matching the selected input type.

    Args:
        choice: The selected radio value.

    Returns:
        A pair of ``gr.update`` objects for (YouTube URL box, audio file
        upload). The hidden widget also has its value reset to ``None``.
    """
    wants_url = choice == "YouTube URL"
    if not wants_url:
        return gr.update(visible=False, value=None), gr.update(visible=True)
    return gr.update(visible=True), gr.update(visible=False, value=None)
def toggle_language(choice):
    """Show or hide the language dropdown.

    Args:
        choice: State of the "Define Language" checkbox.

    Returns:
        A ``gr.update`` that shows the dropdown preset to ``"id"`` when the
        box is checked, or hides it and resets its value to ``""``.
    """
    language_enabled = choice == True  # noqa: E712 - equality check kept as-is
    if not language_enabled:
        return gr.update(visible=False, value="")
    return gr.update(visible=True, value="id")
# NOTE(review): the nesting of the layout containers below was reconstructed
# from source whose indentation was lost - confirm against the deployed UI.
_INPUT_TYPE_CHOICES = ["YouTube URL", "Audio File"]
_WHISPER_MODEL_CHOICES = [
    "openai/whisper-tiny",
    "openai/whisper-base",
    "openai/whisper-small",
    "openai/whisper-medium",
    "openai/whisper-large-v3",
    "distil-whisper/distil-large-v3",
]
_GEMINI_MODEL_CHOICES = ["gemini-1.5-flash", "gemini-1.5-pro"]
_LANGUAGE_CHOICES = ["id", "en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh"]

with gr.Blocks(theme='NoCrypt/miku') as demo:
    gr.Label('Youtube Summarizer WebUI created with ❤️ by Ryusui', show_label=False)

    # ---- Input controls ----
    with gr.Accordion("Input"):
        with gr.Column():
            input_type = gr.Radio(
                _INPUT_TYPE_CHOICES,
                label="Input Type",
                value="Audio File",
                info="Please consider using the audio file if you face any issues with the YouTube URL. Currently youtube is banning HuggingFace IP Addresses.",
            )
            with gr.Row():
                youtube_url = gr.Textbox(
                    label="YouTube URL",
                    visible=False,
                    info="Input the full URL of the YouTube video you want to transcribe and summarize. Example: https://www.youtube.com/watch?v=VIDEO_ID",
                )
                audio_file = gr.File(
                    label="Upload Audio File",
                    visible=True,
                    file_types=['.wav', '.flac', '.mp3'],
                )
            whisper_model = gr.Dropdown(
                _WHISPER_MODEL_CHOICES,
                label="Whisper Model",
                value="distil-whisper/distil-large-v3",
                info="Tiny is the fastest model, but it's not the best quality. large-v3 is the best quality, but it's the slowest model.",
            )
            gemini_model_variant = gr.Dropdown(
                _GEMINI_MODEL_CHOICES,
                label="Gemini Model Variant",
                value="gemini-1.5-pro",
                info="Gemini-1.5-flash is the fastest model, but it's not the best quality. Gemini-1.5-pro is the best quality, but it's slower",
            )
            define_language = gr.Checkbox(
                label="Define Language",
                value=False,
                info="If you want to define the language, check this box",
            )
            language = gr.Dropdown(
                _LANGUAGE_CHOICES,
                label="Language",
                value=None,
                info="Select the language for transcription",
                visible=False,
            )
            gemini_api_key = gr.Textbox(
                label="Gemini API Key (Optional)",
                placeholder="Enter your Gemini API key or leave blank to use default",
                info="If you facing error on transcription, please try to use your own API key",
            )
            gemini_prompt = gr.Textbox(
                label="Gemini Prompt",
                value="Buatkan resume dari transkrip ini",
            )
            transcribe_button = gr.Button("Transcribe and Summarize")

    # ---- Output widgets ----
    with gr.Accordion("Output"):
        with gr.Column():
            transcription_output = gr.Textbox(label="Transcription Output")
            summary_output = gr.Textbox(label="Summary Output")
            transcription_file = gr.File(label="Download Transcription")
            summary_file = gr.File(label="Download Summary")
            processing_time = gr.Textbox(label="Total Processing Time (seconds)")

    # ---- Event wiring ----
    input_type.change(
        fn=toggle_input,
        inputs=input_type,
        outputs=[youtube_url, audio_file],
    )
    define_language.change(
        fn=toggle_language,
        inputs=define_language,
        outputs=[language],
    )
    # The order of `inputs` matches the positional parameters of transcribe().
    transcribe_button.click(
        fn=transcribe,
        inputs=[
            youtube_url,
            audio_file,
            whisper_model,
            gemini_api_key,
            gemini_prompt,
            gemini_model_variant,
            language,
        ],
        outputs=[
            transcription_output,
            summary_output,
            transcription_file,
            summary_file,
            processing_time,
        ],
    )

print("Launching Gradio interface...")
demo.launch()