import os

import google.generativeai as genai
import gradio as gr
import requests
from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip
from PIL import Image

# Configure Google Gemini API
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

# Play.ht API keys
API_KEY = os.getenv("PLAY_API_KEY")
USER_ID = os.getenv("PLAY_USER_ID")

# Ensure compatibility with updated PIL library
if not hasattr(Image, "ANTIALIAS"):
    # Image.ANTIALIAS is deprecated; LANCZOS is the replacement
    Image.ANTIALIAS = Image.LANCZOS

# Theme selection
theme = gr.themes.Base(
    primary_hue="emerald",
)


# Function to upload an image file to Gemini
def upload_to_gemini(path, mime_type="image/jpeg"):
    file = genai.upload_file(path, mime_type=mime_type)
    return file


# Function to generate the roast dialogue for an uploaded image
def generate_roast(image_path):
    try:
        uploaded_file = upload_to_gemini(image_path)
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-1.5-flash-002",
            generation_config=generation_config,
            system_instruction="""
            You are an AI assistant tasked with creating a satirical conversation between two female hosts gently roasting the uploaded picture. The conversation should feature the two hosts discussing the topic in a natural, conversational manner, with frequent backchanneling and interruptions to make it sound authentic. Keep the conversation between 100 and 150 words.

            Please abide by these guidelines:
            1. Begin conversation turns with the prefixes 'Host 1:' and 'Host 2:'. For example, Host 1: Hi? Host 2: How are you? Host 1: I'm good.
            2. Use humor, irony, and sarcasm to gently roast and entertain the person depicted in the image based on their appearance.
            3. Your output should be a well-written text suitable for reading aloud. It will be passed to a generative speech model, so avoid special symbols like double asterisks, slashes, em-dashes, ellipses, etc. Also, avoid output that isn't dialogue.
            4. Conversation turns should be concise and on-topic.
            5. Ensure a natural flow of conversation, with hosts engaging with each other's ideas and bringing their own perspectives.
            6. Include speech disfluencies and interruptions to make it sound authentic.
            7. Incorporate frequent backchanneling throughout the conversation.
""", ) chat_session = model.start_chat( history=[{"role": "user", "parts": [uploaded_file]}] ) response = chat_session.send_message("Roast this image!") return response.text except Exception as e: return f"Error generating roast: {e}" # Function to convert text to speech with Play.ht def text_to_speech(text): try: url = "https://api.play.ai/api/v1/tts/stream" payload = { "model": "PlayDialog", "voice": "s3://voice-cloning-zero-shot/adb83b67-8d75-48ff-ad4d-a0840d231ef1/original/manifest.json", "voice2": "s3://voice-cloning-zero-shot/831bd330-85c6-4333-b2b4-10c476ea3491/original/manifest.json", "turnPrefix": "Host 1:", "turnPrefix2": "Host 2:", 'prompt': None, 'prompt2': None, "output_format": "mp3", "text": text, } headers = { "content-type": "application/json", "Authorization": API_KEY, "X-User-ID": USER_ID } response = requests.post(url, json=payload, headers=headers) if response.status_code == 200: audio_path = "output_audio.mp3" with open(audio_path, "wb") as audio_file: audio_file.write(response.content) return audio_path else: return f"Error generating audio: {response.status_code} - {response.text}" except Exception as e: return f"Error generating audio: {e}" # Function to create video from image, audio, and add logo overlay def create_video(image, audio): try: # Load the audio file audio_clip = AudioFileClip(audio) # Load the main image and set its duration to match the audio image_clip = ImageClip(image).set_duration(audio_clip.duration) # Load the logo image, resize it, and position it in the top-right corner logo = ImageClip("PlayAI-Logo-RoastURL.png").resize(height=75) # Adjust the height as needed #logo = ImageClip("Logo.png").resize(height=75) # Adjust the height as needed logo = logo.margin(bottom=10, opacity=0).set_position(("center", "bottom")).set_duration(audio_clip.duration) # Create a composite video with the main image and the logo overlay video_clip = CompositeVideoClip([image_clip, logo]).set_audio(audio_clip) # Save the video to a temporary file output_path = "/tmp/output_video_with_logo.mp4" video_clip.write_videofile( output_path, fps=30, codec="libx264", audio_codec="aac", preset="slow", ffmpeg_params=["-b:v", "2000k"] # Adjust bitrate if needed ) return output_path except Exception as e: return f"Error generating video: {e}" # Function to process all steps at once def process_roast(image_path): roast_text = generate_roast(image_path) audio_path = text_to_speech(roast_text) video_path = create_video(image_path, audio_path) return roast_text, audio_path, video_path # Gradio Interface with gr.Blocks(theme=theme) as demo: gr.Markdown("# Get Roasted, Ready?") gr.Markdown("Upload an image, click 'Roast Image', and the AI will roast it") with gr.Row(): image_input = gr.Image(type="filepath", label="Upload Image") with gr.Column(): output_text = gr.Textbox(label="Roast Text") audio_output = gr.Audio(label="Roast Audio") video_output = gr.Video(label="Roast Video") # Single button to handle all actions roast_button = gr.Button("Roast Image") roast_button.click(process_roast, inputs=image_input, outputs=[output_text, audio_output, video_output]) gr.Examples( examples=[["TSwift.jpg"], ["GRamsay.jpg"],["therock.jpg"]], inputs=image_input ) # Launch the app demo.launch(debug=True)