"""Gradio app: Stable Diffusion 3.5 image generation with optional prompt helpers.

A prompt is built from an uploaded image (captioned by Florence-2) or from
typed text, optionally rewritten by an external LLM (``LLMInferenceNode``)
and/or a local prompt-enhancer model, and then rendered by the SD3.5 pipeline.
"""

import os
import random
import subprocess
import sys

# NOTE: `spaces` is kept ahead of torch/diffusers/transformers, as in the
# original import order (ZeroGPU setups generally expect it to be imported
# before CUDA-touching libraries -- confirm before reordering).
import spaces
import gradio as gr
import numpy as np
import torch
from diffusers import DiffusionPipeline
from huggingface_hub import hf_hub_download
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline

from llm_inference import LLMInferenceNode

# Install flash-attn at startup (best effort: a failed install does not abort
# the app). The variable is merged into the current environment instead of
# replacing it, so PATH/HOME/proxy settings remain visible to pip, and pip is
# invoked through the running interpreter so the package lands in the
# environment that actually executes this script.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# SD3.5 model
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    torch_dtype=dtype,
    use_safetensors=True,
    variant="fp16",
    token=huggingface_token,
).to(device)

# Initialize Florence model
florence_model = (
    AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )
    .to(device)
    .eval()
)
florence_processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True
)

# Florence-2 is driven by task tokens. The collapsed source had an empty
# string in all three places this token is used, which looks like the token
# was stripped together with the HTML tags of `title`; it is restored here
# (NOTE(review): confirm against the upstream app).
FLORENCE_TASK = "<MORE_DETAILED_CAPTION>"

# Prompt Enhancer
enhancer_long = pipeline(
    "summarization",
    model="gokaygokay/Lamini-Prompt-Enchance-Long",
    device=device,
)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

# Single source of truth for the provider -> model lists used by both the
# initial dropdown state and `update_model_choices`.
PROVIDER_MODELS = {
    "Hugging Face": [
        "Qwen/Qwen2.5-72B-Instruct",
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "mistralai/Mistral-7B-Instruct-v0.3",
    ],
    "SambaNova": [
        "Meta-Llama-3.1-70B-Instruct",
        "Meta-Llama-3.1-405B-Instruct",
        "Meta-Llama-3.1-8B-Instruct",
    ],
}
DEFAULT_PROVIDER = "Hugging Face"

hf_hub_download(
    repo_id="stabilityai/stable-diffusion-3.5-large-turbo",
    filename="LICENSE.md",
    local_dir="./models",
    token=huggingface_token,
)

# Initialize LLMInferenceNode
llm_node = LLMInferenceNode()


# Florence caption function
@spaces.GPU
def florence_caption(image):
    """Return a Florence-2 caption for ``image``.

    Args:
        image: A ``PIL.Image.Image``; any other value is passed to
            ``Image.fromarray`` first.

    Returns:
        The post-processed generation stored under ``FLORENCE_TASK``.
    """
    # Convert image to PIL if it's not already
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    inputs = florence_processor(
        text=FLORENCE_TASK, images=image, return_tensors="pt"
    ).to(device)
    generated_ids = florence_model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Special tokens are kept in the decoded text; the processor's
    # post-processing step receives the raw generation.
    generated_text = florence_processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = florence_processor.post_process_generation(
        generated_text,
        task=FLORENCE_TASK,
        image_size=(image.width, image.height),
    )
    return parsed_answer[FLORENCE_TASK]


# Prompt Enhancer function
@spaces.GPU
def enhance_prompt(input_prompt):
    """Rewrite ``input_prompt`` with the prompt-enhancer pipeline.

    Args:
        input_prompt: Prompt text to expand.

    Returns:
        The ``summary_text`` of the first pipeline result.
    """
    result = enhancer_long("Enhance the description: " + input_prompt)
    return result[0]["summary_text"]


def generate_llm_prompt(input_text, provider, model, prompt_type):
    """Ask the external LLM node to turn ``input_text`` into a prompt.

    Args:
        input_text: Caption or user text to rewrite.
        provider: LLM provider name forwarded to ``llm_node.generate``.
        model: Model identifier forwarded to ``llm_node.generate``.
        prompt_type: Prompt style forwarded to ``llm_node.generate``.

    Returns:
        The generated prompt, or ``input_text`` unchanged if the call raises.
    """
    try:
        return llm_node.generate(
            input_text=input_text,
            long_talk=True,
            compress=False,
            compression_level="medium",
            poster=False,
            prompt_type=prompt_type,
            provider=provider,
            model=model,
        )
    except Exception as e:
        # Deliberate best effort: a provider failure must not block image
        # generation, so fall back to the unmodified text.
        print(f"An error occurred in generate_llm_prompt: {e}")
        return input_text  # Return original input if there's an error


@spaces.GPU(duration=75)
def process_workflow(
    image,
    text_prompt,
    use_enhancer,
    use_llm_generator,
    llm_provider,
    llm_model,
    prompt_type,
    seed,
    randomize_seed,
    width,
    height,
    guidance_scale,
    num_inference_steps,
    negative_prompt="",
    progress=gr.Progress(track_tqdm=True),
):
    """Build the final prompt and render one image with the SD3.5 pipeline.

    With an image, the prompt is its Florence-2 caption, optionally rewritten
    by the LLM generator. Without an image, ``text_prompt`` is used as is (the
    LLM generator is not applied on this path). The enhancer, when enabled,
    runs last in both cases.

    Args:
        image: Optional input image (PIL image or array), or ``None``.
        text_prompt: Prompt used when no image is given.
        use_enhancer: Whether to run ``enhance_prompt`` on the prompt.
        use_llm_generator: Whether to rewrite the caption via the LLM node.
        llm_provider: Provider name for the LLM node.
        llm_model: Model identifier for the LLM node.
        prompt_type: Prompt style for the LLM node.
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        width: Output width in pixels.
        height: Output height in pixels.
        guidance_scale: Guidance scale passed to the pipeline.
        num_inference_steps: Number of denoising steps.
        negative_prompt: Negative prompt passed to the pipeline.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        Tuple ``(image, prompt, seed)``: the generated image, the prompt that
        was actually used, and the integer seed that was actually used.
    """
    if image is not None:
        # Convert image to PIL if it's not already
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        caption = florence_caption(image)
        print(f"Florence caption: {caption}")

        if use_llm_generator:
            prompt = generate_llm_prompt(caption, llm_provider, llm_model, prompt_type)
        else:
            prompt = caption
    else:
        prompt = text_prompt

    if use_enhancer:
        prompt = enhance_prompt(prompt)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    # UI sliders may deliver floats; `manual_seed` and the pipeline's size and
    # step arguments need integers.
    seed = int(seed)

    generator = torch.Generator(device=device).manual_seed(seed)

    image = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        generator=generator,
        num_inference_steps=int(num_inference_steps),
        width=int(width),
        height=int(height),
        guidance_scale=guidance_scale,
    ).images[0]

    return image, prompt, seed


title = """

Stable Diffusion 3.5 with Florence-2 Captioner and Prompt Enhancer

[Stable Diffusion 3.5 Model] [Florence-2 Model] [Prompt Enhancer Long]

Create long prompts from images or enhance your short prompts with prompt enhancer

"""

# NOTE(review): the source lost its indentation, so the container nesting
# below is reconstructed from statement order; it only affects visual
# grouping, not the event wiring.
with gr.Blocks(theme='bethecloud/storj_theme') as demo:
    gr.HTML(title)

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                input_image = gr.Image(
                    label="Input Image (Florence-2 Captioner)", height=512
                )

            with gr.Accordion("Image Settings", open=False):
                width = gr.Slider(
                    label="Width",
                    minimum=512,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,
                )
                height = gr.Slider(
                    label="Height",
                    minimum=512,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,
                )
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=0.0,
                    maximum=7.5,
                    step=0.1,
                    value=4.5,
                )
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=40,
                )
                seed = gr.Slider(
                    label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0
                )
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                negative_prompt = gr.Textbox(label="Negative Prompt")

        with gr.Column(scale=1):
            with gr.Group(elem_classes="input-group"):
                text_prompt = gr.Textbox(
                    label="Text Prompt (optional, used if no image is uploaded)"
                )
                use_enhancer = gr.Checkbox(label="Use Prompt Enhancer", value=False)
                use_llm_generator = gr.Checkbox(
                    label="Use LLM Prompt Generator", value=False
                )

                with gr.Accordion("LLM Settings", open=False):
                    # The three LLM controls start hidden and are revealed by
                    # `update_llm_visibility` when the generator is enabled.
                    llm_provider = gr.Dropdown(
                        choices=list(PROVIDER_MODELS),
                        label="LLM Provider",
                        value=DEFAULT_PROVIDER,
                        visible=False,
                    )
                    llm_model = gr.Dropdown(
                        label="LLM Model",
                        choices=PROVIDER_MODELS[DEFAULT_PROVIDER],
                        value=PROVIDER_MODELS[DEFAULT_PROVIDER][0],
                        visible=False,
                    )
                    prompt_type = gr.Dropdown(
                        choices=[
                            "Random",
                            "Long",
                            "Short",
                            "Medium",
                            "OnlyObjects",
                            "NoFigure",
                            "Landscape",
                            "Fantasy",
                        ],
                        label="Prompt Type",
                        value="Random",
                        visible=False,
                    )

                generate_prompt_btn = gr.Button(
                    "Generate Prompt", elem_classes="submit-btn"
                )
                final_prompt = gr.Textbox(label="Final Prompt", interactive=False)
                generate_btn = gr.Button("Generate Image", elem_classes="submit-btn")

        with gr.Column(scale=1):
            with gr.Group(elem_classes="output-group"):
                output_image = gr.Image(
                    label="Result", elem_id="gallery", show_label=False
                )
                used_seed = gr.Number(label="Seed Used")

    def update_model_choices(provider):
        """Return a model dropdown update listing the models of ``provider``.

        The first model becomes the selected value; an unknown provider
        yields an empty choice list with ``""`` selected.
        """
        models = PROVIDER_MODELS.get(provider, [])
        return gr.Dropdown(choices=models, value=models[0] if models else "")

    def update_llm_visibility(use_llm):
        """Show or hide the three LLM controls according to ``use_llm``."""
        return {
            llm_provider: gr.update(visible=use_llm),
            llm_model: gr.update(visible=use_llm),
            prompt_type: gr.update(visible=use_llm),
        }

    use_llm_generator.change(
        update_llm_visibility,
        inputs=[use_llm_generator],
        outputs=[llm_provider, llm_model, prompt_type],
    )

    llm_provider.change(
        update_model_choices,
        inputs=[llm_provider],
        outputs=[llm_model],
    )

    def generate_prompt(
        image,
        text_prompt,
        use_enhancer,
        use_llm_generator,
        llm_provider,
        llm_model,
        prompt_type,
    ):
        """Build a prompt without rendering an image.

        Starts from the Florence-2 caption when an image is given, otherwise
        from ``text_prompt``; then applies the LLM generator and the enhancer
        in that order when each is enabled. Unlike ``process_workflow``, the
        LLM generator is applied to typed text as well.
        """
        if image is not None:
            initial_prompt = florence_caption(image)
        else:
            initial_prompt = text_prompt

        if use_llm_generator:
            prompt = generate_llm_prompt(
                initial_prompt, llm_provider, llm_model, prompt_type
            )
        else:
            prompt = initial_prompt

        if use_enhancer:
            prompt = enhance_prompt(prompt)

        return prompt

    generate_prompt_btn.click(
        fn=generate_prompt,
        inputs=[
            input_image,
            text_prompt,
            use_enhancer,
            use_llm_generator,
            llm_provider,
            llm_model,
            prompt_type,
        ],
        outputs=[final_prompt],
    )

    # NOTE(review): `final_prompt` (not `text_prompt`) is fed to
    # `process_workflow`, so typed text only reaches the image pipeline after
    # "Generate Prompt" has filled `final_prompt`. With an image present the
    # prompt is rebuilt from a fresh caption, and with the enhancer enabled an
    # already enhanced final prompt is enhanced again. Wiring kept as in the
    # original; confirm this is intended.
    generate_btn.click(
        fn=process_workflow,
        inputs=[
            input_image,
            final_prompt,
            use_enhancer,
            use_llm_generator,
            llm_provider,
            llm_model,
            prompt_type,
            seed,
            randomize_seed,
            width,
            height,
            guidance_scale,
            num_inference_steps,
            negative_prompt,
        ],
        outputs=[output_image, final_prompt, used_seed],
    )

demo.launch(debug=True)