import spaces import gradio as gr import torch from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor, pipeline import re import random import os from huggingface_hub import snapshot_download from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256 import StableDiffusionXLPipeline from kolors.models.modeling_chatglm import ChatGLMModel from kolors.models.tokenization_chatglm import ChatGLMTokenizer from diffusers import UNet2DConditionModel, AutoencoderKL from diffusers import EulerDiscreteScheduler # Initialize models device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 # Download Kolors model ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors") # Load Kolors models text_encoder = ChatGLMModel.from_pretrained(os.path.join(ckpt_dir, 'text_encoder'), torch_dtype=dtype).to(device) tokenizer = ChatGLMTokenizer.from_pretrained(os.path.join(ckpt_dir, 'text_encoder')) vae = AutoencoderKL.from_pretrained(os.path.join(ckpt_dir, "vae"), revision=None).to(dtype).to(device) scheduler = EulerDiscreteScheduler.from_pretrained(os.path.join(ckpt_dir, "scheduler")) unet = UNet2DConditionModel.from_pretrained(os.path.join(ckpt_dir, "unet"), revision=None).to(dtype).to(device) kolors_pipe = StableDiffusionXLPipeline( vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler, force_zeros_for_empty_prompt=False ).to(device) kolors_pipe.enable_model_cpu_offload() # VLM Captioner vlm_model = PaliGemmaForConditionalGeneration.from_pretrained("gokaygokay/sd3-long-captioner-v2").to(device).eval() vlm_processor = PaliGemmaProcessor.from_pretrained("gokaygokay/sd3-long-captioner-v2") # Prompt Enhancer enhancer_medium = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance", device=device) enhancer_long = pipeline("summarization", model="gokaygokay/Lamini-Prompt-Enchance-Long", device=device) MAX_SEED = 2**32 - 1 # VLM Captioner function def create_captions_rich(image): prompt = "caption en" model_inputs = vlm_processor(text=prompt, images=image, return_tensors="pt").to(device) input_len = model_inputs["input_ids"].shape[-1] with torch.inference_mode(): generation = vlm_model.generate(**model_inputs, repetition_penalty=1.10, max_new_tokens=256, do_sample=False) generation = generation[0][input_len:] decoded = vlm_processor.decode(generation, skip_special_tokens=True) return modify_caption(decoded) # Helper function for caption modification def modify_caption(caption: str) -> str: prefix_substrings = [ ('captured from ', ''), ('captured at ', '') ] pattern = '|'.join([re.escape(opening) for opening, _ in prefix_substrings]) replacers = {opening: replacer for opening, replacer in prefix_substrings} def replace_fn(match): return replacers[match.group(0)] return re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE) # Prompt Enhancer function def enhance_prompt(input_prompt, model_choice): if model_choice == "Medium": result = enhancer_medium("Enhance the description: " + input_prompt) enhanced_text = result[0]['summary_text'] pattern = r'^.*?of\s+(.*?(?:\.|$))' match = re.match(pattern, enhanced_text, re.IGNORECASE | re.DOTALL) if match: remaining_text = enhanced_text[match.end():].strip() modified_sentence = match.group(1).capitalize() enhanced_text = modified_sentence + ' ' + remaining_text else: # Long result = enhancer_long("Enhance the description: " + input_prompt) enhanced_text = result[0]['summary_text'] return enhanced_text def generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps): if randomize_seed: seed = random.randint(0, MAX_SEED) generator = torch.Generator(device=device).manual_seed(seed) image = kolors_pipe( prompt=prompt, negative_prompt=negative_prompt, guidance_scale=guidance_scale, num_inference_steps=num_inference_steps, width=width, height=height, generator=generator ).images[0] return image, seed # Gradio Interface @spaces.GPU def process_workflow(image, text_prompt, use_vlm, use_enhancer, model_choice, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps): if use_vlm and image is not None: prompt = create_captions_rich(image) else: prompt = text_prompt if use_enhancer: prompt = enhance_prompt(prompt, model_choice) generated_image, used_seed = generate_image(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps) return generated_image, prompt, used_seed custom_css = """ .input-group, .output-group { border: 1px solid #e0e0e0; border-radius: 10px; padding: 20px; margin-bottom: 20px; background-color: #f9f9f9; } .submit-btn { background-color: #2980b9 !important; color: white !important; } .submit-btn:hover { background-color: #3498db !important; } """ title = """
Don't forget to click Use VLM Captioner or Use Prompt Enhancer Buttons!