# Spaces: Running on Zero
import spaces | |
import gradio as gr | |
import torch | |
from diffusers import StableDiffusion3Pipeline | |
from huggingface_hub import snapshot_download,login | |
from transformers import pipeline | |
from PIL import Image | |
import os | |
# Read the Hugging Face API token from the environment and stop immediately
# when it is absent; the token is used for the Hub login and model download.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
if huggingface_token is None:
    raise ValueError("HUGGINGFACE_TOKEN environment variable is not set.")

# Authenticate this process with the Hugging Face Hub.
login(token=huggingface_token)
# Choose the compute device once at start-up and report which one was picked:
# the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
    print("CUDA is available. Using GPU.")
else:
    device = "cpu"
    print("CUDA is not available. Using CPU.")
# Download the Stable Diffusion 3 Medium repository snapshot (pinned to the
# "refs/pr/26" revision) into a local directory, skipping markdown and
# .gitattributes files.
model_path = snapshot_download(
    repo_id="stabilityai/stable-diffusion-3-medium",
    revision="refs/pr/26",
    repo_type="model",
    ignore_patterns=["*.md", "*.gitattributes"],
    local_dir="stable-diffusion-3-medium",
    token=huggingface_token,
)

# Build the text-to-image pipeline without the third text encoder/tokenizer.
# Half precision is only used on the GPU: diffusers warns that float16
# pipelines cannot run on the CPU, so the CPU fallback loads in float32.
image_gen = StableDiffusion3Pipeline.from_pretrained(
    model_path,
    text_encoder_3=None,
    tokenizer_3=None,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
image_gen = image_gen.to(device)
# Build the image-to-text (captioning) pipeline on the selected device, using
# the Salesforce BLIP large captioning checkpoint.
caption_image = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
def generate_image_from_caption(image, num_inference_steps=50, guidance_scale=7.5):
    """Caption the given image, then generate a new image from that caption.

    Args:
        image: The input image handed over by the Gradio image component.
        num_inference_steps: Number of inference steps for the diffusion
            pipeline. Coerced to ``int`` because a UI slider may deliver a
            float, while the pipeline needs a whole number of steps.
        guidance_scale: Guidance scale forwarded to the diffusion pipeline.

    Returns:
        The first image produced by the diffusion pipeline.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when the user submits without uploading anything;
    # surface a readable message instead of failing inside the captioner.
    if image is None:
        raise gr.Error("Please upload an image first.")

    # Generate the caption that will serve as the prompt.
    caption = caption_image(image)[0]["generated_text"]
    print("Generated Caption:", caption)

    # Generate the image from the caption.
    result = image_gen(
        prompt=caption,
        num_inference_steps=int(num_inference_steps),
        guidance_scale=guidance_scale,
        negative_prompt="blurred, ugly, watermark, low resolution, blurry",
        height=512,
        width=512,
    )

    return result.images[0]
# Create the Gradio interface.
iface = gr.Interface(
    # On ZeroGPU Spaces a GPU is only attached while a function wrapped with
    # spaces.GPU is running; the wrapper is documented as effect-free in
    # other environments.
    fn=spaces.GPU(generate_image_from_caption),
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        # step=1 keeps the slider on whole numbers; without it Gradio derives
        # a fractional step from the range and would send values like 50.3.
        gr.Slider(
            label="Number of inference steps",
            minimum=1,
            maximum=100,
            step=1,
            value=50,
        ),
        gr.Slider(label="Guidance scale", minimum=1.0, maximum=20.0, value=7.5),
    ],
    outputs=gr.Image(label="Generated Image"),
    title="Image-to-Image Generator using Caption",
    description="Upload an image to generate a caption, and then use the caption as a prompt to generate a new image using Stable Diffusion.",
)

# Launch the Gradio app.
iface.launch()