import gradio as gr
import os
import pytube
from pytube import YouTube
from pprint import pprint
from moviepy.editor import VideoFileClip
from transformers import pipeline
import librosa

# Build the speech-recognition pipeline once, at module load, so transcribe()
# can reuse the same Whisper checkpoint on every call.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
)

def download_video_mp4(youtube_url):
    """Download the highest-resolution progressive MP4 stream of a video.

    Args:
        youtube_url: URL of the YouTube video. Surrounding whitespace is
            ignored.

    Returns:
        A ``(video_filename, error)`` tuple. On success ``video_filename`` is
        the path returned by pytube's ``Stream.download()`` and ``error`` is
        ``""``. On failure ``video_filename`` is ``""`` and ``error`` is a
        non-empty message. Errors are reported through the tuple rather than
        raised.
    """
    # Reject a missing or blank URL with a readable message instead of
    # surfacing whatever pytube raises for it.
    url = youtube_url.strip() if isinstance(youtube_url, str) else ""
    if not url:
        return "", "No YouTube URL provided."

    try:
        yt = YouTube(url)

        # Progressive streams carry video and audio in one file; pick the
        # MP4 one with the highest resolution.
        video = (
            yt.streams
            .filter(progressive=True, file_extension='mp4')
            .order_by('resolution')
            .desc()
            .first()
        )
        # first() yields None when the filter matched nothing; report that
        # explicitly rather than failing on None.download().
        if video is None:
            return "", "No progressive MP4 stream is available for this video."

        # With no arguments pytube chooses the output location and file name.
        video_filename = video.download()

        print('Video downloaded')
        return video_filename, ""
    except Exception as e:
        # UI boundary: turn any failure into an error string. Fall back to
        # the exception type so the message is never empty, because callers
        # treat an empty error as "no error".
        return "", str(e) or type(e).__name__

def create_audio_file(video_filename):
    """Extract the audio track of a video into an ``.mp3`` file beside it.

    Args:
        video_filename: Path of the video file to read.

    Returns:
        A ``(audio_filename, error)`` tuple. On success ``audio_filename`` is
        the video path with its extension replaced by ``.mp3`` and ``error``
        is ``""``. On failure ``audio_filename`` is ``""`` and ``error`` is a
        non-empty message. Errors are reported through the tuple rather than
        raised.
    """
    if not video_filename:
        return "", "No video file provided."

    try:
        # Swap only the real extension. A plain str.replace(".mp4", ".mp3")
        # would also rewrite ".mp4" elsewhere in the path and, for a
        # non-.mp4 input, would return the video's own path as the target.
        audio_filename = os.path.splitext(video_filename)[0] + ".mp3"

        video = VideoFileClip(video_filename)
        try:
            audio = video.audio
            # A clip without an audio track exposes audio as None.
            if audio is None:
                return "", "The video has no audio track."
            audio.write_audiofile(audio_filename)
        finally:
            # Always release the clip's underlying reader, even on failure.
            video.close()

        return audio_filename, ""
    except Exception as e:
        # UI boundary: never return an empty error string on failure, since
        # callers treat an empty error as "no error".
        return "", str(e) or type(e).__name__

def transcribe(audio_path):
    """Transcribe an audio file with the module-level Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        A ``(text, error)`` tuple. On success ``text`` is the transcript and
        ``error`` is ``""``. On failure ``text`` is ``""`` and ``error`` is a
        non-empty message. Errors are reported through the tuple rather than
        raised.
    """
    if not audio_path:
        return "", "No audio file provided."

    try:
        # Decode the file to a waveform resampled to 16 kHz, which is passed
        # to the pipeline as a raw array.
        audio, _ = librosa.load(audio_path, sr=16000)

        # Whisper handles about 30 seconds of audio per forward pass. Asking
        # the pipeline to chunk the input lets recordings longer than that
        # be transcribed in full instead of being cut off or rejected.
        transcript = transcriber(audio, chunk_length_s=30)
        return transcript["text"], ""
    except Exception as e:
        # UI boundary: never return an empty error string on failure, since
        # callers treat an empty error as "no error".
        return "", str(e) or type(e).__name__

def _remove_quietly(path):
    """Delete *path* if it is set, ignoring files that cannot be removed."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: a file that is already gone or still locked
        # must not turn a finished transcription into an error.
        pass


def process_youtube_url(youtube_url):
    """Download a YouTube video, extract its audio and transcribe it.

    Runs download_video_mp4, create_audio_file and transcribe in order and
    stops at the first step that fails. The intermediate video and audio
    files are deleted before returning, whether or not the run succeeded.

    Args:
        youtube_url: URL of the YouTube video. Surrounding whitespace is
            ignored.

    Returns:
        A ``(transcript, error)`` tuple. Exactly one of the two is non-empty:
        the transcript on success, otherwise a message describing the step
        that failed.
    """
    if not isinstance(youtube_url, str) or not youtube_url.strip():
        return "", "Please enter a YouTube URL."

    video_filename = audio_filename = ""
    try:
        video_filename, download_error = download_video_mp4(youtube_url.strip())
        if not video_filename:
            return "", download_error

        audio_filename, audio_error = create_audio_file(video_filename)
        if not audio_filename:
            return "", audio_error

        yt_text, transcribe_error = transcribe(audio_filename)
        if not yt_text:
            # An empty transcript with no error would otherwise leave both
            # output boxes blank, so say what happened.
            return "", (
                transcribe_error
                or "No speech could be transcribed from this video."
            )

        return yt_text, ""
    finally:
        # The files are only intermediates; without this every request would
        # leave a video and an audio file behind on disk.
        _remove_quietly(audio_filename)
        _remove_quietly(video_filename)

# Wire up the web UI: a single URL textbox feeds process_youtube_url, and its
# (transcript, error) result pair fills the two output textboxes in order.
iface = gr.Interface(
    title="YouTube Video Transcription",
    description="Enter a YouTube video URL to transcribe the audio using the Whisper model from Hugging Face.",
    fn=process_youtube_url,
    inputs=gr.Textbox(label="YouTube URL"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Errors"),
    ],
)

# Launch the interface as soon as the module is executed.
iface.launch()