Spaces:

PlayHT
/

roast_your_pic

Running on CPU Upgrade

App Files Files Community

1littlecoder committed 26 days ago

Commit

69617e8

•

1 Parent(s): 5f4d187

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -19

app.py CHANGED Viewed

@@ -1,20 +1,49 @@
 import gradio as gr
 import requests
-import json
-import os
-# Replace with your actual Play.ht API key and User ID
 API_KEY = os.getenv('PLAY_API_KEY')
 USER_ID = os.getenv('PLAY_USER_ID')
 def text_to_speech(text):
     url = "https://api.play.ht/api/v2/tts/stream"
-    # Customize the payload based on your Play.ht account setup
     payload = {
-        "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json", # Replace with your desired voice
         "output_format": "mp3",
-        "text": text  # Text to be converted to speech
     }
     headers = {
         "accept": "audio/mpeg",
@@ -24,10 +53,7 @@ def text_to_speech(text):
     }
     response = requests.post(url, json=payload, headers=headers)
-    # Check if the response was successful
     if response.status_code == 200:
-        # Save the audio content to a file
         audio_path = "output_audio.mp3"
         with open(audio_path, "wb") as audio_file:
             audio_file.write(response.content)
@@ -35,13 +61,25 @@ def text_to_speech(text):
     else:
         return f"Error: {response.status_code} - {response.text}"
-# Set up Gradio Interface
-iface = gr.Interface(
-    fn=text_to_speech,
-    inputs="text",
-    outputs="audio",
-    title="Play.ht Text-to-Speech",
-    description="Convert text into speech using Play.ht's TTS streaming API."
-)
-iface.launch(debug = True)

+import os
+import google.generativeai as genai
 import gradio as gr
 import requests
+# Configure Google Gemini API
+genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
+# Play.ht API keys
 API_KEY = os.getenv('PLAY_API_KEY')
 USER_ID = os.getenv('PLAY_USER_ID')
# Upload a local file to Gemini so it can be referenced in a prompt
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at ``path`` through ``genai.upload_file``.

    Args:
        path: Location of the file to upload.
        mime_type: MIME type forwarded to the upload call; defaults to
            ``"image/jpeg"``.

    Returns:
        The object returned by ``genai.upload_file`` for the uploaded file.
    """
    return genai.upload_file(path, mime_type=mime_type)
def generate_roast(image_path):
    """Upload an image to Gemini and return the roast text it generates.

    Args:
        image_path: Filesystem path of the picture to roast.

    Returns:
        The text of the model's reply to the "Roast this image!" message.

    Raises:
        ValueError: If ``image_path`` is ``None`` or empty.
    """
    # Function-scope import keeps this block self-contained.
    import mimetypes

    if not image_path:
        raise ValueError("No image was provided to roast.")

    # The picture is not necessarily a JPEG: derive the MIME type from the
    # file name and only fall back to JPEG when it cannot be determined.
    guessed_type, _ = mimetypes.guess_type(image_path)
    if not guessed_type or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"

    # Upload the image to Gemini so it can be attached to the chat history.
    uploaded_file = upload_to_gemini(image_path, mime_type=guessed_type)

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
        system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
    )

    # The uploaded picture is supplied as the first user turn; the actual
    # request is sent as a follow-up message.
    chat_session = model.start_chat(
        history=[{"role": "user", "parts": [uploaded_file]}]
    )
    response = chat_session.send_message("Roast this image!")
    return response.text
+# Function to convert text to speech with Play.ht
 def text_to_speech(text):
     url = "https://api.play.ht/api/v2/tts/stream"
     payload = {
+        "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
         "output_format": "mp3",
+        "text": text,
     }
     headers = {
         "accept": "audio/mpeg",
     }
     response = requests.post(url, json=payload, headers=headers)
     if response.status_code == 200:
         audio_path = "output_audio.mp3"
         with open(audio_path, "wb") as audio_file:
             audio_file.write(response.content)
     else:
         return f"Error: {response.status_code} - {response.text}"
# Gradio Interface
# NOTE(review): gr.Blocks documents `theme` as a Theme object or a theme name;
# confirm that this dict is actually honored rather than ignored.
with gr.Blocks(theme={"primary_hue": "#b4fd83"}) as demo:
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column():
            output_text = gr.Textbox(label="Roast Text")
            audio_output = gr.Audio(label="Roast Audio")

    def process_image(image):
        """Roast the uploaded image and convert the roast to speech.

        Args:
            image: Path of the uploaded image as provided by ``image_input``
                (falsy when nothing was uploaded).

        Returns:
            A ``(roast_text, audio_path)`` pair for the text and audio
            outputs; ``audio_path`` is ``None`` when speech synthesis failed.

        Raises:
            gr.Error: If no image was uploaded.
        """
        if not image:
            raise gr.Error("Please upload an image before generating a roast.")

        roast_text = generate_roast(image)
        audio_path = text_to_speech(roast_text)

        # text_to_speech reports failures as an "Error: ..." string; passing
        # that to the audio component would be treated as a file path, so
        # surface it as a warning and still show the roast text.
        if isinstance(audio_path, str) and audio_path.startswith("Error:"):
            gr.Warning(audio_path)
            return roast_text, None

        return roast_text, audio_path

    submit_button = gr.Button("Generate Roast")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])

# Launch the app
demo.launch(debug=True)