import gradio as gr
from transformers import pipeline
from PIL import Image

# Build each Hugging Face pipeline once at import time; the Gradio callbacks
# defined below call these shared module-level objects on every request.
image_class_pipe = pipeline(
    task="image-classification",
    model="google/vit-large-patch16-224",
)
video_class_pipe = pipeline(
    task="video-classification",
    model="nateraw/videomae-base-finetuned-ucf101-subset",
)
depth_estimator = pipeline(
    task="depth-estimation",
    model="Intel/dpt-large",
)
image_caption = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

def classify_image_func(arr):
    """Return the label of the first prediction for an image.

    Args:
        arr: Image data in a form accepted by ``PIL.Image.fromarray`` (the
            value the ``gr.Image`` input hands to this callback), or ``None``
            when no image has been provided.

    Returns:
        The ``"label"`` field of the first entry returned by
        ``image_class_pipe`` (presumably the highest-scoring class; confirm
        against the pipeline docs), or an empty string when ``arr`` is
        ``None``.
    """
    # Clicking the button with an empty image component yields None, which
    # Image.fromarray cannot convert; return an empty result instead of
    # surfacing a traceback to the user.
    if arr is None:
        return ""

    img = Image.fromarray(arr)
    image_result = image_class_pipe(img)
    return image_result[0]["label"]

def classify_video_func(vid):
    """Run the video-classification pipeline on a video.

    Args:
        vid: The value supplied by the ``gr.Video`` input (presumably a path
            to the uploaded file; confirm against the Gradio version in use),
            or ``None``/empty when no video has been provided.

    Returns:
        Whatever ``video_class_pipe`` returns for ``vid``, unmodified, or an
        empty list when no video was supplied.
    """
    # An empty video component yields None; the pipeline cannot load that,
    # so report "no predictions" rather than raising.
    if not vid:
        return []

    video_result = video_class_pipe(vid)
    return video_result

def estimate_depth_func(arr):
    """Return the depth map predicted for an image.

    Args:
        arr: Image data in a form accepted by ``PIL.Image.fromarray`` (the
            value the ``gr.Image`` input hands to this callback), or ``None``
            when no image has been provided.

    Returns:
        The ``"depth"`` entry of the ``depth_estimator`` result, or ``None``
        when ``arr`` is ``None`` so the output image stays empty.
    """
    # Clicking the button with an empty image component yields None, which
    # Image.fromarray cannot convert; skip inference in that case.
    if arr is None:
        return None

    img = Image.fromarray(arr)
    depth_result = depth_estimator(img)
    return depth_result["depth"]

def blip_captioning_func(arr):
    """Generate a caption for an image with the BLIP captioning pipeline.

    Args:
        arr: Image data in a form accepted by ``PIL.Image.fromarray`` (the
            value the ``gr.Image`` input hands to this callback), or ``None``
            when no image has been provided.

    Returns:
        The ``"generated_text"`` field of the first entry returned by
        ``image_caption``, or an empty string when ``arr`` is ``None``.
    """
    # Clicking the button with an empty image component yields None, which
    # Image.fromarray cannot convert; return an empty caption instead.
    if arr is None:
        return ""

    img = Image.fromarray(arr)
    # Generation is capped at 500 new tokens.
    image_caption_result = image_caption(img, max_new_tokens=500)
    return image_caption_result[0]["generated_text"]

with gr.Blocks() as demo:
    # Page heading, created before any of the tabs.
    gr.Markdown("# AI Methods")

    # --- Tab 1: image and video classification ---------------------------
    with gr.Tab("Media Classification"):
        gr.Markdown("# Image Classification")

        with gr.Row():
            classify_image_input = gr.Image(width=340, height=340)
        with gr.Row():
            classify_image_btn = gr.Button("Classify Image")
            classify_image_output = gr.Textbox(label="Result")

        # Button click -> classify the uploaded image -> show the label.
        classify_image_btn.click(
            fn=classify_image_func,
            inputs=[classify_image_input],
            outputs=[classify_image_output],
        )

        gr.Markdown("# Video Classification")

        with gr.Row():
            classify_video_input = gr.Video(width=340, height=340)
        with gr.Row():
            classify_video_btn = gr.Button("Classify Video")
            classify_video_output = gr.Textbox(label="Result")

        # Button click -> classify the uploaded video -> show the result.
        classify_video_btn.click(
            fn=classify_video_func,
            inputs=[classify_video_input],
            outputs=[classify_video_output],
        )

    # --- Tab 2: monocular depth estimation --------------------------------
    with gr.Tab("Depth"):
        gr.Markdown("# Depth Estimation")

        with gr.Row():
            depth_estimation_input = gr.Image(width=260, height=260)
        with gr.Row():
            depth_estimation_btn = gr.Button("Estimate Depth")
        with gr.Row():
            depth_estimation_output = gr.Image()

        # Button click -> estimate depth -> render the depth map as an image.
        depth_estimation_btn.click(
            fn=estimate_depth_func,
            inputs=[depth_estimation_input],
            outputs=[depth_estimation_output],
        )

    # --- Tab 3: image captioning -------------------------------------------
    with gr.Tab("BLIP Captioning"):
        gr.Markdown("# BLIP Captioning")

        with gr.Row():
            blip_input = gr.Image(width=260, height=260)
        with gr.Row():
            blip_btn = gr.Button("BLIP Caption")
            blip_output = gr.Textbox(label="Caption")

        # Button click -> caption the uploaded image -> show the text.
        blip_btn.click(
            fn=blip_captioning_func,
            inputs=[blip_input],
            outputs=[blip_output],
        )

# Start the app with Gradio's debug mode enabled.
demo.launch(debug=True)