import getpass
import json
import math
import os
import tempfile
import uuid
from datetime import datetime
from typing import Dict, Optional

import gradio as gr
import PIL.Image
import pixeltable as pxt
import requests
from pixeltable.functions import openai, image
from pixeltable.iterators import FrameIterator


# Upload size ceiling, in megabytes; process_video rejects larger files.
MAX_VIDEO_SIZE_MB = 35

# Upper bound on how many extracted frames are sent to the vision model.
MAX_FRAMES = 5


# Analysis styles selectable in the UI, keyed by the value passed to
# process_video as `prompt_template`. Each entry carries:
#   name          - human-readable title
#   system_prompt - system message sent with the chat completion request
#   description   - one-line summary of the style
# The prompts are written as adjacent literals joined with explicit newlines
# so their content does not depend on source-file indentation.
PROMPT_TEMPLATES = {
    "descriptive": {
        "name": "Descriptive Analysis",
        "system_prompt": (
            "You are a video content analyzer. Please generate a short and concise compelling description\n"
            "that summarizes the overall action and content of this video sequence. Focus on describing\n"
            "the key events, changes, and movements you observe across all frames."
        ),
        "description": "Generates a clear, factual description of the video content",
    },
    "cinematic": {
        "name": "Cinematic Analysis (Christopher Nolan style)",
        "system_prompt": (
            "You are Christopher Nolan, the acclaimed filmmaker. Describe this visual sequence\n"
            "as one continuous, flowing narrative moment, as you would when discussing a pivotal\n"
            "scene from one of your films. Focus on psychological undercurrents, visual symbolism,\n"
            "and the deeper thematic implications of what unfolds."
        ),
        "description": "Analyzes the video from a filmmaker's perspective with artistic interpretation",
    },
    "documentary": {
        "name": "Documentary Style (David Attenborough)",
        "system_prompt": (
            "You are David Attenborough, the renowned naturalist and documentarian. Narrate this sequence\n"
            "with your characteristic blend of scientific insight and storytelling prowess. Focus on the\n"
            "compelling details that bring the subject matter to life, while maintaining your signature\n"
            "warm, authoritative tone."
        ),
        "description": "Creates a nature documentary style narration",
    },
    "technical": {
        "name": "Technical Analysis",
        "system_prompt": (
            "You are a technical video analyst. Break down this sequence with precise attention to\n"
            "technical details including movement patterns, visual composition, lighting conditions,\n"
            "and any notable technical aspects of the footage."
        ),
        "description": "Provides detailed technical analysis of the video",
    },
    "labelling": {
        "name": "Labelling and Annotation",
        "system_prompt": (
            "You are a high-precision video labeling system designed to replace human labelers.\n"
            "Analyze this sequence with extreme attention to detail, focusing on:\n"
            "1. Object identification and tracking\n"
            "2. Precise descriptions of movements and actions\n"
            "3. Spatial relationships between objects\n"
            "4. Changes in object positions and behaviors\n"
            "Your goal is to provide detailed, accurate annotations that could be used for\n"
            "training computer vision models or validating automated systems."
        ),
        "description": "Provides detailed object and action annotations for machine learning purposes",
    },
}


# Voices offered in the UI: the key is the value sent as `voice` in the
# text-to-speech request, the value is a display label.
VOICE_OPTIONS = {
    "alloy": "Alloy (Balanced)",
    "echo": "Echo (Smooth)",
    "fable": "Fable (Expressive)",
    "onyx": "Onyx (Authoritative)",
    "nova": "Nova (Friendly)",
    "shimmer": "Shimmer (Warm)",
}


def _select_representative_frames(frames: list, num_frames: int) -> list:
    """Return at most ``num_frames`` items of ``frames``, sampled at even intervals."""
    total_frames = len(frames)
    if total_frames <= num_frames:
        return frames

    interval = total_frames / num_frames
    return [frames[math.floor(i * interval)] for i in range(num_frames)]


def _build_frame_content(frames: list) -> list:
    """Build the multi-part user message: an intro text, then a label and image per frame."""
    content = [
        {
            "type": "text",
            "text": "This is a sequence of frames from a video. Please analyze the overall action and content across all frames:",
        }
    ]

    for index, frame in enumerate(frames, 1):
        content.extend([
            {
                "type": "text",
                "text": f"Frame {index}:",
            },
            {
                "type": "image_url",
                # NOTE(review): the MIME type is hard-coded to JPEG while the
                # frames come from image.b64_encode with its default format --
                # confirm the two match.
                "image_url": {
                    "url": f"data:image/jpeg;base64,{frame}",
                },
            },
        ])

    return content


def process_video(
    video_file: "gr.Video",
    api_key: str,
    prompt_template: str,
    voice_choice: str,
    progress: "Optional[gr.Progress]" = None,
) -> tuple[str, Optional[str]]:
    """Describe a video with GPT-4o and narrate the description with OpenAI TTS.

    A dedicated Pixeltable directory is created for the request and dropped
    again before returning, whether the request succeeds or fails.

    Args:
        video_file: The uploaded video; either an object exposing ``.name`` or
            a value whose ``str()`` is the file path.
        api_key: OpenAI API key. It is exported as ``OPENAI_API_KEY``.
        prompt_template: A key of ``PROMPT_TEMPLATES`` selecting the system prompt.
        voice_choice: Voice name sent to the text-to-speech endpoint.
        progress: Optional callable invoked as ``progress(fraction, desc=...)``.
            NOTE(review): the UI wires this function without a ``progress``
            input, so the default ``None`` applies there -- confirm whether
            Gradio is expected to inject a tracker.

    Returns:
        ``(text, audio_path)`` on success; ``audio_path`` is ``None`` when
        speech synthesis failed. On any other failure, ``(error_message, None)``.
    """

    def report(fraction: float, desc: str) -> None:
        # Explicit None check: whether a tracker object is truthy is up to its class.
        if progress is not None:
            progress(fraction, desc=desc)

    try:
        if not video_file or not api_key:
            return "Please provide both video file and API key.", None

        # Reject an unknown style before any table is created or video ingested.
        if prompt_template not in PROMPT_TEMPLATES:
            return f"Error: Unknown analysis style '{prompt_template}'", None
        template = PROMPT_TEMPLATES[prompt_template]

        # NOTE(review): this sets the key process-wide, so concurrent requests
        # using different keys share it -- confirm that is acceptable for the
        # deployment.
        os.environ['OPENAI_API_KEY'] = api_key

        video_path = video_file.name if hasattr(video_file, 'name') else str(video_file)

        file_size = os.path.getsize(video_path) / (1024 * 1024)
        if file_size > MAX_VIDEO_SIZE_MB:
            return f"Error: Video file size ({file_size:.1f}MB) exceeds limit of {MAX_VIDEO_SIZE_MB}MB", None

        report(0.1, "Initializing...")

        # The timestamp alone has one-second resolution; the random suffix keeps
        # two requests in the same second from sharing (and dropping) a
        # directory or overwriting each other's audio file.
        session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}"
        dir_name = f'video_processor_{session_id}'

        pxt.drop_dir(dir_name, force=True)
        try:
            pxt.create_dir(dir_name)

            video_table = pxt.create_table(
                f'{dir_name}.videos',
                {
                    "video": pxt.VideoType(nullable=True),
                    "timestamp": pxt.TimestampType(),
                },
            )

            # One frame per second of video, each stored base64-encoded.
            frames_view = pxt.create_view(
                f'{dir_name}.frames',
                video_table,
                iterator=FrameIterator.create(video=video_table.video, fps=1),
            )
            frames_view['encoded_frame'] = image.b64_encode(frames_view.frame)

            report(0.2, "Processing video...")

            video_table.insert([{
                "video": video_path,
                "timestamp": datetime.now(),
            }])

            report(0.4, "Extracting frames...")

            frames = frames_view.select(frames_view.encoded_frame).collect()
            frame_list = [f["encoded_frame"] for f in frames]
            if not frame_list:
                # Nothing to show the model; fail clearly instead of asking it
                # to describe an empty sequence.
                return "Error: No frames could be extracted from the video.", None

            selected_frames = _select_representative_frames(frame_list, MAX_FRAMES)

            report(0.6, "Analyzing with GPT-4 Vision...")

            messages = [
                {
                    'role': 'system',
                    'content': template["system_prompt"],
                },
                {
                    'role': 'user',
                    'content': _build_frame_content(selected_frames),
                },
            ]

            video_table['response'] = openai.chat_completions(
                messages=messages,
                model='gpt-4o',
                max_tokens=500,
            )
            video_table['content'] = video_table.response.choices[0].message.content.astype(pxt.StringType())

            report(0.8, "Generating audio...")

            @pxt.udf
            def generate_voiceover(script: str, voice: str) -> str:
                # Best-effort: any failure is logged and yields None so the
                # text result can still be returned without audio.
                try:
                    response = requests.post(
                        "https://api.openai.com/v1/audio/speech",
                        headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
                        json={
                            "model": "tts-1",
                            "input": script,
                            "voice": voice,
                        },
                        # Without a timeout a stalled connection blocks the request forever.
                        timeout=120,
                    )
                    if response.status_code != 200:
                        raise Exception(f"TTS API error: {response.status_code} - {response.text}")

                    temp_dir = tempfile.gettempdir()
                    temp_audio_path = os.path.join(temp_dir, f"voiceover_{session_id}.mp3")

                    with open(temp_audio_path, 'wb') as f:
                        f.write(response.content)

                    return temp_audio_path
                except Exception as e:
                    print(f"Error generating audio: {e}")
                    return None

            video_table['audio_path'] = generate_voiceover(video_table.content, voice_choice)
            results = video_table.select(
                video_table.content,
                video_table.audio_path,
            ).tail(1)

            # Read the values out before the directory is dropped.
            content_text = results['content'][0]
            audio_path = results['audio_path'][0]

            report(1.0, "Processing complete!")

            return content_text, audio_path
        finally:
            # Runs on success and on failure so per-request tables never pile up.
            try:
                pxt.drop_dir(dir_name, force=True)
            except Exception as e:
                print(f"Warning: Could not clean up directory {dir_name}: {e}")

    except Exception as e:
        print(f"Error processing video: {e}")
        return f"Error processing video: {str(e)}", None


def create_interface():
    """Build the Gradio Blocks UI and wire the button to ``process_video``.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    # NOTE(review): the container nesting (which components sit inside which
    # Row/Column) was reconstructed from statement order -- confirm against the
    # intended layout.
    # NOTE(review): the pictographs in front of "Generate detailed
    # descriptions", "Process up to ... key frames", the "Analysis" tab,
    # "Built with Pixeltable" and "Resources" are best guesses -- confirm.
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.Markdown(
            f"""
            <div style="text-align: left; margin-bottom: 2rem;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 1rem;" />
            <h1>🎥 AI Video Analyzer: Custom GPT-4 Analysis & TTS Narration</h1>
            <p>Convert videos into rich narratives with {len(PROMPT_TEMPLATES)} analysis styles - from Christopher Nolan-style cinematic breakdowns to David Attenborough documentary narrations.</p>
            </div>
            """
        )

        gr.HTML(
            """
            <div style="background-color: #FFF3CD; border: 1px solid #FF7D04; padding: 1rem; margin: 1rem 0; border-radius: 4px;">
                <p style="margin: 0; color: #013056;">
                    ⚠️ <strong>Notice:</strong> This application requires an OpenAI API key and uses the following services:
                    <ul style="margin-top: 0.5rem;">
                        <li>GPT-4 Vision API for video analysis</li>
                        <li>TTS API for audio generation</li>
                    </ul>
                    Please be aware of associated API costs. For pricing information, visit
                    <a href="https://openai.com/pricing" target="_blank" style="color: #856404; text-decoration: underline;">OpenAI's pricing page</a>.
                    <br><br>
                    This application does not process audio/transcripts. If you need audio transcription and analysis, check out our
                    <a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #856404; text-decoration: underline;">
                    Call Analysis AI Tool</a> which uses Whisper for audio processing.
                </p>
            </div>
            """
        )

        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown(f"""
                    - 🎥 Analyze video content using GPT-4 Vision
                    - 📝 Generate detailed descriptions and narrations
                    - 🎧 Create professional voiceovers using OpenAI's TTS
                    - 🔄 Process up to {MAX_FRAMES} key frames from your video
                    """)

            with gr.Column():
                with gr.Accordion("How to use", open=True):
                    gr.Markdown(f"""
                    1. Enter your OpenAI API key
                    2. Upload a video file (max {MAX_VIDEO_SIZE_MB}MB)
                    3. Choose your preferred analysis style and voice
                    4. Click "Process Video" and wait for results
                    """)

        with gr.Row():
            # Left column: credentials, upload, trigger and examples.
            with gr.Column():
                with gr.Row():
                    with gr.Column(scale=1):
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                        )

                video_input = gr.Video(
                    label=f"Upload Video (max {MAX_VIDEO_SIZE_MB}MB)",
                    interactive=True,
                )

                process_btn = gr.Button("🎬 Process Video", variant="primary")

                gr.Markdown("""
                <h4>Click one of the examples below to get started:</h4>
                """)

                gr.Examples(
                    examples=[["example1.mp4"], ["example2.mp4"]],
                    inputs=[video_input],
                )

            # Right column: style/voice selection and the two result tabs.
            with gr.Column():
                prompt_template = gr.Dropdown(
                    choices=list(PROMPT_TEMPLATES),
                    value="descriptive",
                    label="Analysis Style",
                    info="Choose analysis style",
                )

                voice_choice = gr.Dropdown(
                    choices=list(VOICE_OPTIONS),
                    value="onyx",
                    label="Voice Selection",
                    info="Select the voice for your narration",
                )

                with gr.Tabs():
                    with gr.TabItem("📝 Analysis"):
                        content_output = gr.Textbox(
                            label="Generated Content",
                            lines=10,
                        )

                    with gr.TabItem("🎧 Audio"):
                        audio_output = gr.Audio(
                            label="Generated Voiceover",
                            type="filepath",
                        )

        gr.HTML(
            """
            <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
                <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
                        <p style="margin: 0.5rem 0; color: #6b7280;">
                            Open Source AI infrastructure for intelligent applications
                        </p>
                    </div>
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">📚 Resources</h4>
                        <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                GitHub
                            </a>
                            <a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                Documentation
                            </a>
                        </div>
                    </div>
                </div>
            </div>
            """
        )

        # Input order must match process_video's positional parameters.
        process_btn.click(
            fn=process_video,
            inputs=[video_input, api_key, prompt_template, voice_choice],
            outputs=[content_output, audio_output],
        )

    return demo


if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    demo = create_interface()
    demo.launch()