Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running on CPU Upgrade

App Files Files Community

awacke1 committed on May 21

Commit

65224df

•

1 Parent(s): 0b0a63a

Create backup-app3-0521-app.py

Browse files

Files changed (1) hide show

backup-app3-0521-app.py +172 -0

backup-app3-0521-app.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import streamlit as st
+import openai
+from openai import OpenAI
+import os
+import base64
+import cv2
+from moviepy.editor import VideoFileClip
+# documentation
+# 1. Cookbook:  https://cookbook.openai.com/examples/gpt4o/introduction_to_gpt4o
+# 2. Configure your Project and Orgs to limit/allow Models:  https://platform.openai.com/settings/organization/general
+# 3. Watch your Billing!  https://platform.openai.com/settings/organization/billing/overview
+# Set API key and organization ID from environment variables
+openai.api_key = os.getenv('OPENAI_API_KEY')
+openai.organization = os.getenv('OPENAI_ORG_ID')
+client = OpenAI(api_key= os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
+# Define the model to be used
+#MODEL = "gpt-4o"
+MODEL = "gpt-4o-2024-05-13"
+def process_text():
+    text_input = st.text_input("Enter your text:")
+    if text_input:
+        completion = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
+                {"role": "user", "content": f"Hello! Could you solve {text_input}?"}
+            ]
+        )
+        st.write("Assistant: " + completion.choices[0].message.content)
+def process_image(image_input):
+    if image_input:
+        base64_image = base64.b64encode(image_input.read()).decode("utf-8")
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that responds in Markdown."},
+                {"role": "user", "content": [
+                    {"type": "text", "text": "Help me understand what is in this picture and list ten facts as markdown outline with appropriate emojis that describes what you see."},
+                    {"type": "image_url", "image_url": {
+                        "url": f"data:image/png;base64,{base64_image}"}
+                    }
+                ]}
+            ],
+            temperature=0.0,
+        )
+        st.markdown(response.choices[0].message.content)
+def process_audio(audio_input):
+    if audio_input:
+        transcription = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=audio_input,
+        )
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+            {"role": "system", "content":"""You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."""},
+            {"role": "user", "content": [{"type": "text", "text": f"The audio transcription is: {transcription.text}"}],}
+            ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
+def process_audio_for_video(video_input):
+    if video_input:
+        transcription = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=video_input,
+        )
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+            {"role": "system", "content":"""You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."""},
+            {"role": "user", "content": [{"type": "text", "text": f"The audio transcription is: {transcription}"}],}
+            ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
+        return response.choices[0].message.content
+def save_video(video_file):
+    # Save the uploaded video file
+    with open(video_file.name, "wb") as f:
+        f.write(video_file.getbuffer())
+    return video_file.name
+def process_video(video_path, seconds_per_frame=2):
+    base64Frames = []
+    base_video_path, _ = os.path.splitext(video_path)
+    video = cv2.VideoCapture(video_path)
+    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = video.get(cv2.CAP_PROP_FPS)
+    frames_to_skip = int(fps * seconds_per_frame)
+    curr_frame = 0
+    # Loop through the video and extract frames at specified sampling rate
+    while curr_frame < total_frames - 1:
+        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+        success, frame = video.read()
+        if not success:
+            break
+        _, buffer = cv2.imencode(".jpg", frame)
+        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+        curr_frame += frames_to_skip
+    video.release()
+    # Extract audio from video
+    audio_path = f"{base_video_path}.mp3"
+    clip = VideoFileClip(video_path)
+    clip.audio.write_audiofile(audio_path, bitrate="32k")
+    clip.audio.close()
+    clip.close()
+    print(f"Extracted {len(base64Frames)} frames")
+    print(f"Extracted audio to {audio_path}")
+    return base64Frames, audio_path
+def process_audio_and_video(video_input):
+    if video_input is not None:
+        # Save the uploaded video file
+        video_path = save_video(video_input )
+        # Process the saved video
+        base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
+        # Get the transcript for the video model call
+        transcript = process_audio_for_video(video_input)
+        # Generate a summary with visual and audio
+        response = client.chat.completions.create(
+            model=MODEL,
+            messages=[
+                {"role": "system", "content": """You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"""},
+                {"role": "user", "content": [
+                    "These are the frames from the video.",
+                    *map(lambda x: {"type": "image_url",
+                                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
+                    {"type": "text", "text": f"The audio transcription is: {transcript}"}
+                ]},
+            ],
+            temperature=0,
+        )
+        st.markdown(response.choices[0].message.content)
+def main():
+    st.markdown("### OpenAI GPT-4o Model")
+    st.markdown("#### The Omni Model with Text, Audio, Image, and Video")
+    option = st.selectbox("Select an option", ("Text", "Image", "Audio", "Video"))
+    if option == "Text":
+        process_text()
+    elif option == "Image":
+        image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
+        process_image(image_input)
+    elif option == "Audio":
+        audio_input = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
+        process_audio(audio_input)
+    elif option == "Video":
+        video_input = st.file_uploader("Upload a video file", type=["mp4"])
+        process_audio_and_video(video_input)
+if __name__ == "__main__":
+    main()