Spaces:

PlayHT
/

roast_your_pic

Running on CPU Upgrade

App Files Community

1littlecoder committed on 26 days ago

Commit

f16d803

•

1 Parent(s): 4d0dc05

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -33

app.py CHANGED Viewed

@@ -1,7 +1,15 @@
 import os
 import google.generativeai as genai
 import gradio as gr
 import requests
 # Configure Google Gemini API
 genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
@@ -10,20 +18,11 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 API_KEY = os.getenv('PLAY_API_KEY')
 USER_ID = os.getenv('PLAY_USER_ID')
-# theme selection let's go with this before the branded color
-#theme={"primary_hue": "#b4fd83"}
-theme = gr.themes.Base(
-    primary_hue="emerald",
-)
-# Function to upload image to Gemini and get roasted text
 def upload_to_gemini(path, mime_type="image/jpeg"):
     file = genai.upload_file(path, mime_type=mime_type)
     return file
 def generate_roast(image_path):
-    # Upload the image to Gemini and get the text
     uploaded_file = upload_to_gemini(image_path)
     generation_config = {
         "temperature": 1,
@@ -35,16 +34,12 @@ def generate_roast(image_path):
     model = genai.GenerativeModel(
         model_name="gemini-1.5-flash-002",
         generation_config=generation_config,
-        system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
-    )
-    chat_session = model.start_chat(
-        history=[{"role": "user", "parts": [uploaded_file]}]
     )
     response = chat_session.send_message("Roast this image!")
     return response.text
-# Function to convert text to speech with Play.ht
 def text_to_speech(text):
     url = "https://api.play.ht/api/v2/tts/stream"
     payload = {
@@ -58,7 +53,6 @@ def text_to_speech(text):
         "Authorization": API_KEY,
         "X-User-ID": USER_ID
     }
     response = requests.post(url, json=payload, headers=headers)
     if response.status_code == 200:
         audio_path = "output_audio.mp3"
@@ -66,27 +60,31 @@ def text_to_speech(text):
             audio_file.write(response.content)
         return audio_path
     else:
-        return f"Error: {response.status_code} - {response.text}"
-# Gradio Interface
-with gr.Blocks(theme = theme) as demo:
-    gr.Markdown("# Image to Text-to-Speech Roasting App")
-    gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")
     with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(type="filepath", label="Upload Image")
-        with gr.Column():
-            output_text = gr.Textbox(label="Roast Text")
-            audio_output = gr.Audio(label="Roast Audio")
-    def process_image(image):
-        roast_text = generate_roast(image)
-        audio_path = text_to_speech(roast_text)
-        return roast_text, audio_path
-    submit_button = gr.Button("Generate Roast")
-    submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])
 # Launch the app
 demo.launch(debug=True)

 import os
+import tempfile
+import shutil
 import google.generativeai as genai
 import gradio as gr
 import requests
+import numpy as np
+import subprocess
+import matplotlib.pyplot as plt
+from matplotlib.animation import FuncAnimation
+import PIL.Image
+from gradio import processing_utils, utils
 # Gemini client setup: the key is read from the GEMINI_API_KEY environment variable.
 genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))
 # Play.ht credentials; text_to_speech sends them as request headers.
 API_KEY = os.environ.get('PLAY_API_KEY')
 USER_ID = os.environ.get('PLAY_USER_ID')
 def upload_to_gemini(path, mime_type="image/jpeg"):
     """Upload the file at ``path`` with ``genai.upload_file`` and return its result.
     ``mime_type`` is forwarded unchanged and defaults to ``"image/jpeg"``.
     """
     return genai.upload_file(path, mime_type=mime_type)
 def generate_roast(image_path):
     uploaded_file = upload_to_gemini(image_path)
     generation_config = {
         "temperature": 1,
     model = genai.GenerativeModel(
         model_name="gemini-1.5-flash-002",
         generation_config=generation_config,
+        system_instruction="You are a professional satirist and fashion expert. Roast the profile picture.",
     )
+    chat_session = model.start_chat(history=[{"role": "user", "parts": [uploaded_file]}])
     response = chat_session.send_message("Roast this image!")
     return response.text
 def text_to_speech(text):
     url = "https://api.play.ht/api/v2/tts/stream"
     payload = {
         "Authorization": API_KEY,
         "X-User-ID": USER_ID
     }
     response = requests.post(url, json=payload, headers=headers)
     if response.status_code == 200:
         audio_path = "output_audio.mp3"
             audio_file.write(response.content)
         return audio_path
     else:
+        raise ValueError(f"Error: {response.status_code} - {response.text}")
+# Generate waveform and overlay with image
+def make_waveform_overlay(audio_path, image_path):
+    output_video_path = make_waveform(audio_path, bg_image=image_path, animate=True)
+    return output_video_path
+# Full Gradio Functionality
+def process_image(image):
+    roast_text = generate_roast(image)
+    audio_path = text_to_speech(roast_text)
+    final_video_path = make_waveform_overlay(audio_path, image)
+    return roast_text, final_video_path
+# Gradio Blocks UI
+with gr.Blocks() as demo:  # build the page; `demo` is launched at the end of the file
+    gr.Markdown("# Image Roast and Waveform Video Generator")  # page heading
     with gr.Row():  # the image input and both outputs are declared inside one row
+        image_input = gr.Image(type="filepath", label="Upload Image")  # its value is the `image` argument of process_image
+        output_text = gr.Textbox(label="Roast Text")  # receives the first value returned by process_image
+        output_video = gr.Video(label="Roast Waveform Video")  # receives the second value returned by process_image
+    submit_button = gr.Button("Generate Roast Video")
+    submit_button.click(process_image, inputs=image_input, outputs=[output_text, output_video])  # clicking runs process_image on the image input
 # Launch the app (runs at module level, with debug=True passed to launch)
 demo.launch(debug=True)