Spaces:

awacke1
/

GPT-4o-omni-text-audio-image-video

Running on CPU Upgrade

App Files Community

awacke1 committed on May 14

Commit

19b3cb0

•

1 Parent(s): 1a9386d

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -39

app.py CHANGED Viewed

@@ -15,32 +15,24 @@ MODEL = "gpt-4o"
 def process_text():
     text_input = st.text_input("Enter your text:")
     if text_input:
-        completion = openai.ChatCompletion.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant. Help me with my math homework!"},
-                {"role": "user", "content": f"Hello! Could you solve {text_input}?"}
-            ]
         )
-        st.write("Assistant: " + completion.choices[0].message["content"])
 def process_image(image_input):
     if image_input:
         base64_image = base64.b64encode(image_input.read()).decode("utf-8")
-        response = openai.ChatCompletion.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant that responds in Markdown. Help me with my math homework!"},
-                {"role": "user", "content": [
-                    {"type": "text", "text": "What's the area of the triangle?"},
-                    {"type": "image_url", "image_url": {
-                        "url": f"data:image/png;base64,{base64_image}"}
-                    }
-                ]}
-            ],
-            temperature=0.0,
         )
-        st.markdown(response.choices[0].message["content"])
 def process_audio(audio_input):
     if audio_input:
@@ -48,17 +40,13 @@ def process_audio(audio_input):
             model="whisper-1",
             file=audio_input,
         )
-        response = openai.ChatCompletion.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."},
-                {"role": "user", "content": [
-                    {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
-                ]},
-            ],
-            temperature=0,
         )
-        st.markdown(response.choices[0].message["content"])
 def process_video(video_input):
     if video_input:
@@ -67,20 +55,14 @@ def process_video(video_input):
             model="whisper-1",
             file=open(audio_path, "rb"),
         )
-        response = openai.ChatCompletion.create(
             model=MODEL,
-            messages=[
-                {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"},
-                {"role": "user", "content": [
-                    "These are the frames from the video.",
-                    *map(lambda x: {"type": "image_url",
-                                    "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
-                    {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
-                ]},
-            ],
-            temperature=0,
         )
-        st.markdown(response.choices[0].message["content"])
 def process_video_frames(video_path, seconds_per_frame=2):
     base64Frames = []

 def process_text():
     text_input = st.text_input("Enter your text:")
     if text_input:
+        response = openai.Completion.create(
             model=MODEL,
+            prompt=f"You are a helpful assistant. Help me with my math homework! {text_input}",
+            max_tokens=100,
+            temperature=0.5,
         )
+        st.write("Assistant: " + response.choices[0].text.strip())
 def process_image(image_input):
     if image_input:
         base64_image = base64.b64encode(image_input.read()).decode("utf-8")
+        response = openai.Completion.create(
             model=MODEL,
+            prompt=f"You are a helpful assistant that responds in Markdown. Help me with my math homework! What's the area of the triangle? [image: data:image/png;base64,{base64_image}]",
+            max_tokens=100,
+            temperature=0.5,
         )
+        st.markdown(response.choices[0].text.strip())
 def process_audio(audio_input):
     if audio_input:
             model="whisper-1",
             file=audio_input,
         )
+        response = openai.Completion.create(
             model=MODEL,
+            prompt=f"You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown. The audio transcription is: {transcription['text']}",
+            max_tokens=100,
+            temperature=0.5,
         )
+        st.markdown(response.choices[0].text.strip())
 def process_video(video_input):
     if video_input:
             model="whisper-1",
             file=open(audio_path, "rb"),
         )
+        frames_text = " ".join([f"[image: data:image/jpg;base64,{frame}]" for frame in base64Frames])
+        response = openai.Completion.create(
             model=MODEL,
+            prompt=f"You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown. These are the frames from the video. {frames_text} The audio transcription is: {transcription['text']}",
+            max_tokens=500,
+            temperature=0.5,
         )
+        st.markdown(response.choices[0].text.strip())
 def process_video_frames(video_path, seconds_per_frame=2):
     base64Frames = []