# Hugging Face Space (status when this file was captured: Paused)
import random
import tempfile
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import (
    ControlNetModel,
    EulerDiscreteScheduler,
    StableDiffusionXLControlNetPipeline,
    UNet2DConditionModel,
)
from huggingface_hub import hf_hub_download, snapshot_download
from ip_adapter import IPAdapterXL
from PIL import Image
from safetensors.torch import load_file
# Fetch the SDXL IP-Adapter files (everything under sdxl_models/) from the
# h94/IP-Adapter repo into the current working directory, so the relative
# paths used further down ("sdxl_models/...") resolve.
snapshot_download(
    repo_id="h94/IP-Adapter", allow_patterns="sdxl_models/*", local_dir="."
)
# CPU fallback & pipeline definition
MAX_SEED = np.iinfo(np.int32).max  # largest seed value offered in the UI
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on CUDA; fall back to float32 when running on CPU.
dtype = torch.float16 if "cuda" in str(device) else torch.float32

# load models & scheduler (==> Euler) & ControlNet (==> canny; test what works better)
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
image_encoder_path = "sdxl_models/image_encoder"
ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
controlnet_path = "diffusers/controlnet-canny-sdxl-1.0"
# Use the computed dtype (not a hard-coded float16) so the CPU fallback above
# actually applies to the ControlNet weights as well.
controlnet = ControlNetModel.from_pretrained(
    controlnet_path, use_safetensors=False, torch_dtype=dtype
).to(device)
# load SDXL-Lightning >> put Turbo here if we fall back to Comfy @Litto
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=dtype,
    variant="fp16",
    # The pipeline keyword is `add_watermarker`; the misspelled
    # `add_watermark` is not a pipeline argument and would not disable it.
    add_watermarker=False,
).to(device)
pipe.set_progress_bar_config(disable=True)

# Euler scheduler with trailing timestep spacing, rebuilt from the pipeline's
# existing scheduler config.
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config, timestep_spacing="trailing", prediction_type="epsilon"
)

# Replace the base UNet weights with the 2-step SDXL-Lightning checkpoint.
# Load onto the selected device rather than a hard-coded "cuda", so the CPU
# fallback does not crash here.
pipe.unet.load_state_dict(
    load_file(
        hf_hub_download(
            "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
        ),
        device=device,
    )
)
# load ip-adapter with specific target blocks for style transfer and layout
# preservation. Should be better than Comfy! Test this!
# target_blocks=["block"] for original IP-Adapter
# target_blocks=["up_blocks.0.attentions.1"] for style blocks only
# target_blocks=["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] for style+layout blocks
ip_model = IPAdapterXL(
    pipe,
    image_encoder_path,
    ip_ckpt,
    device,
    target_blocks=["up_blocks.0.attentions.1"],
)
# Resizing the input image
# OpenCV goes here!!!
# Test this with a smaller side length for faster inference
def resize_img(
    input_image,
    max_side=1280,
    min_side=1024,
    size=None,
    pad_to_max_side=False,
    mode=Image.BILINEAR,
    base_pixel_number=64,
):
    """Resize a PIL image, optionally centring it on a white square canvas.

    Args:
        input_image: PIL image to resize.
        max_side: Target length of the longer side when ``size`` is not
            given; also the edge length of the padding canvas.
        min_side: Target length of the shorter side used for the first
            scaling step when ``size`` is not given.
        size: Optional explicit ``(width, height)``. When given, the aspect
            ratio logic is skipped and the image is resized to exactly this.
        pad_to_max_side: If true, paste the resized image centred onto a
            white ``max_side x max_side`` RGB canvas.
        mode: PIL resampling filter passed to ``Image.resize``.
        base_pixel_number: Final width and height are rounded down to a
            multiple of this value when ``size`` is not given.

    Returns:
        The resized (and optionally padded) PIL image.
    """
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        # Scale so the shorter side reaches min_side, then rescale so the
        # longer side is exactly max_side.
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        # Snap both dimensions down to a multiple of base_pixel_number.
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        canvas = np.full((max_side, max_side, 3), 255, dtype=np.uint8)
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        # The canvas has three channels, so force RGB before pasting;
        # grayscale / RGBA arrays would not fit the slice.
        canvas[
            offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
        ] = np.array(input_image.convert("RGB"))
        input_image = Image.fromarray(canvas)
    return input_image
# expand example images for endpoints --> tell Johannes/Jascha what to expect
# Each row: [style image path, guidance image path or None, prompt,
#            scale, control_scale] -- same order as run_for_examples().
examples = [
    [
        "./assets/zeichnung1.jpg",
        None,
        "3D model, cute monster, test prompt",
        1.0,
        0.0,
    ],
    [
        "./assets/zeichnung2.jpg",
        "./assets/guidance-target.jpg",
        "3D model, cute, kawai, monster, another test prompt",
        1.0,
        0.6,
    ],
]
def run_for_examples(style_image, source_image, prompt, scale, control_scale):
    """Run ``create_image`` for one example row with fixed generation settings.

    Only the five values stored per example row are taken from the caller;
    the negative prompt, guidance scale (0.0), step count (2), seed (42),
    target mode and negative-content settings are fixed here.

    Returns:
        Whatever ``create_image`` returns for these settings.
    """
    return create_image(
        image_pil=style_image,
        input_image=source_image,
        prompt=prompt,
        n_prompt="text, watermark, low res, low quality, worst quality, deformed, blurry",
        scale=scale,
        control_scale=control_scale,
        guidance_scale=0.0,
        num_inference_steps=2,
        seed=42,
        target="Load only style blocks",
        neg_content_prompt="",
        neg_content_scale=0,
    )
# Main function for image synthesis (input -> run_for_examples)
# Maps the UI's target-mode label to the attention blocks handed to IPAdapterXL.
_TARGET_BLOCKS = {
    # target_blocks=["blocks"] for original IP-Adapter
    "Load original IP-Adapter": ["blocks"],
    # style blocks only
    "Load only style blocks": ["up_blocks.0.attentions.1"],
    # style + layout blocks
    "Load style+layout block": [
        "up_blocks.0.attentions.1",
        "down_blocks.2.attentions.1",
    ],
}


def create_image(
    image_pil,
    input_image,
    prompt,
    n_prompt,
    scale,
    control_scale,
    guidance_scale,
    num_inference_steps,
    seed,
    target="Load only style blocks",
    neg_content_prompt=None,
    neg_content_scale=0,
):
    """Generate one image from a style image, a prompt and optional guidance.

    Args:
        image_pil: Style reference image, passed to the IP-Adapter as
            ``pil_image``.
        input_image: Optional guidance image. When given, it is resized and
            turned into a Canny edge map for the ControlNet; when ``None`` a
            blank white map is used and ControlNet conditioning is set to 0.
        prompt: Positive text prompt.
        n_prompt: Negative text prompt.
        scale: IP-Adapter scale forwarded to ``generate``.
        control_scale: ControlNet conditioning scale.
        guidance_scale: Guidance scale forwarded to ``generate``.
        num_inference_steps: Number of denoising steps.
        seed: RNG seed; ``-1`` picks a random seed in ``[0, MAX_SEED]``.
        target: One of the keys of ``_TARGET_BLOCKS``; selects which
            attention blocks the IP-Adapter is attached to.
        neg_content_prompt: Optional negative content prompt.
        neg_content_scale: Strength for the negative content prompt; it is
            only forwarded when the prompt is non-empty and this is non-zero.

    Returns:
        ``pathlib.Path`` of a temporary JPEG file containing the result.

    Raises:
        ValueError: If ``target`` is not a known mode.
    """
    seed = random.randint(0, MAX_SEED) if seed == -1 else int(seed)

    try:
        target_blocks = _TARGET_BLOCKS[target]
    except KeyError:
        raise ValueError(f"Unknown IP-Adapter target mode: {target!r}") from None
    # A fresh adapter is built per call so the selected blocks take effect
    # (this intentionally shadows the module-level default adapter).
    ip_model = IPAdapterXL(
        pipe, image_encoder_path, ip_ckpt, device, target_blocks=target_blocks
    )

    if input_image is not None:
        input_image = resize_img(input_image, max_side=1024)
        cv_input_image = pil_to_cv2(input_image)
        detected_map = cv2.Canny(cv_input_image, 50, 200)
        # Canny yields a single-channel edge map; expand it to three channels
        # with GRAY2RGB (BGR2RGB requires a 3/4-channel input).
        canny_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_GRAY2RGB))
    else:
        canny_map = Image.new("RGB", (1024, 1024), color=(255, 255, 255))
        control_scale = 0

    if float(control_scale) == 0:
        canny_map = canny_map.resize((1024, 1024))

    generate_kwargs = dict(
        pil_image=image_pil,
        prompt=prompt,
        negative_prompt=n_prompt,
        scale=scale,
        guidance_scale=guidance_scale,
        num_samples=1,
        # NOTE(review): cast defensively in case the UI slider delivers a float.
        num_inference_steps=int(num_inference_steps),
        seed=seed,
        image=canny_map,
        controlnet_conditioning_scale=float(control_scale),
    )
    if neg_content_prompt and neg_content_scale != 0:
        # NOTE(review): assumes IPAdapterXL.generate accepts these two keyword
        # arguments (as in InstantStyle's ip_adapter) -- TODO confirm.
        generate_kwargs["neg_content_prompt"] = neg_content_prompt
        generate_kwargs["neg_content_scale"] = neg_content_scale
    images = ip_model.generate(**generate_kwargs)

    image = images[0]
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmpfile:
        # check what happens to the images when these settings change!
        image.save(tmpfile, "JPEG", quality=80, optimize=True, progressive=True)
    # Return only after the file has been closed (and therefore flushed).
    return Path(tmpfile.name)
def pil_to_cv2(image_pil):
    """Convert a PIL image into an OpenCV array.

    The image is turned into a NumPy array and its channel order is swapped
    with ``cv2.COLOR_RGB2BGR``.
    """
    image_np = np.array(image_pil)
    return cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
# Gradio description & frontend strings for the Space (remove this for the Endpoint)
title = r"""
<h1 align="center">MewMewMew: Simsalabim!</h1>
"""

description = r"""
<b>Let's test this! ARM <3 GoldExtra</b><br>
<b>SDXL-Lightning && IP-Adapter</b>
"""

# NOTE(review): the mailto target below looks like an e-mail-obfuscation
# placeholder rather than a real address -- restore the actual address.
article = r"""
Ask Hidéo if something breaks: <a href="mailto:[email protected]">Hidéo's Mail</a>
"""
# Gradio UI.
# NOTE(review): the source had lost its indentation; the container nesting
# below is a reconstruction -- confirm the intended layout.
block = gr.Blocks()
with block:
    # description
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Tabs():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        image_pil = gr.Image(label="Style Image", type="pil")
                    with gr.Column():
                        prompt = gr.Textbox(
                            label="Prompt",
                            value="mewmewmew, kitty cats, unicorns, uWu",
                        )
                        scale = gr.Slider(
                            minimum=0,
                            maximum=2.0,
                            step=0.01,
                            value=1.0,
                            label="Maßstab // scale",
                        )

                with gr.Accordion(open=False, label="Für Details erweitern!"):
                    target = gr.Radio(
                        [
                            "Load only style blocks",
                            "Load style+layout block",
                            "Load original IP-Adapter",
                        ],
                        value="Load only style blocks",
                        label="Modus für IP-Adapter auswählen",
                    )
                    with gr.Column():
                        src_image_pil = gr.Image(
                            label="Guidance Image (optional)", type="pil"
                        )
                    control_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.1,
                        value=0.5,
                        label="ControlNet-Stärke // control_scale",
                    )
                    n_prompt = gr.Textbox(
                        label="Negative Prompts",
                        value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
                    )
                    neg_content_prompt = gr.Textbox(
                        label="Negative Content Prompt (optional)", value=""
                    )
                    neg_content_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.1,
                        value=0.5,
                        label="Negative Content Stärke // neg_content_scale",
                    )
                    guidance_scale = gr.Slider(
                        minimum=0,
                        maximum=10.0,
                        step=0.01,
                        value=0.0,
                        label="guidance-scale",
                    )
                    num_inference_steps = gr.Slider(
                        minimum=2,
                        maximum=50.0,
                        step=1.0,
                        value=2,
                        label="Anzahl der Inference Steps (optional) // num_inference_steps",
                    )
                    seed = gr.Slider(
                        minimum=-1,
                        maximum=MAX_SEED,
                        value=-1,
                        step=1,
                        label="Seed Value // -1 = random // Seed-Proof=True",
                    )

                generate_button = gr.Button("Simsalabim")

            with gr.Column():
                generated_image = gr.Image(label="MewMewMagix uWu")

        # Order must match the positional parameters of create_image().
        inputs = [
            image_pil,
            src_image_pil,
            prompt,
            n_prompt,
            scale,
            control_scale,
            guidance_scale,
            num_inference_steps,
            seed,
            target,
            neg_content_prompt,
            neg_content_scale,
        ]
        outputs = [generated_image]

        # Regenerate on the button click and on input events of the prompt,
        # guidance scale, scale, control scale and seed components.
        gr.on(
            triggers=[
                prompt.input,
                generate_button.click,
                guidance_scale.input,
                scale.input,
                control_scale.input,
                seed.input,
            ],
            fn=create_image,
            inputs=inputs,
            outputs=outputs,
            show_progress="minimal",
            show_api=False,
            trigger_mode="always_last",
        )

    gr.Examples(
        examples=examples,
        inputs=[image_pil, src_image_pil, prompt, scale, control_scale],
        fn=run_for_examples,
        outputs=[generated_image],
        cache_examples=True,
    )

    gr.Markdown(article)

block.queue(api_open=False)
block.launch(show_api=False)