ysmao committed
Commit 4342954
1 Parent(s): e3e1936

add layout controlnet

Files changed (5)
  1. annotator/dsine_hub.py +37 -0
  2. annotator/midas.py +34 -0
  3. annotator/upernet.py +190 -0
  4. annotator/util.py +38 -0
  5. app.py +147 -4
annotator/dsine_hub.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ import numpy as np
+ from PIL import Image
+
+
+ class NormalDetector:
+     def __init__(self):
+         self.model_path = "hugoycj/DSINE-hub"
+         self.dsine = torch.hub.load(self.model_path, "DSINE", trust_repo=True)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     @torch.no_grad()
+     def __call__(self, image):
+         self.dsine.model.to(self.device)
+         self.dsine.model.pixel_coords = self.dsine.model.pixel_coords.to(self.device)
+         H, W, C = image.shape
+
+         normal = self.dsine.infer_pil(image)[0]  # channel-first normal map; transposed to (H, W, 3) below
+         normal = (normal + 1.0) / 2.0  # map values from [-1, 1] to [0, 1]
+         normal = (normal * 255).cpu().numpy().astype(np.uint8).transpose(1, 2, 0)
+         normal_img = Image.fromarray(normal).resize((W, H))
+
+         self.dsine.model.to("cpu")
+         self.dsine.model.pixel_coords = self.dsine.model.pixel_coords.to("cpu")
+         return normal_img
+
+
+ if __name__ == "__main__":
+     from diffusers.utils import load_image
+
+     image = load_image(
+         "https://qhstaticssl.kujiale.com/image/jpeg/1716177580588/9AAA49344B9CE33512C4EBD0A287495F.jpg"
+     )
+     image = np.asarray(image)
+     normal_detector = NormalDetector()
+     normal_image = normal_detector(image)
+     normal_image.save("normal_image.jpg")
annotator/midas.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ import numpy as np
+ from PIL import Image
+ from transformers import DPTFeatureExtractor
+ from transformers import DPTForDepthEstimation
+
+
+ class DepthDetector:
+     def __init__(self, model_path=None):
+         if model_path is not None:
+             self.model_path = model_path
+         else:
+             self.model_path = "Intel/dpt-hybrid-midas"
+         self.model = DPTForDepthEstimation.from_pretrained(self.model_path)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.feature_extractor = DPTFeatureExtractor.from_pretrained(self.model_path)
+
+     @torch.no_grad()
+     def __call__(self, image):
+         self.model.to(self.device)
+         H, W, C = image.shape
+         inputs = self.feature_extractor(images=image, return_tensors="pt")
+         inputs["pixel_values"] = inputs["pixel_values"].to(self.device)
+         outputs = self.model(**inputs)
+         predicted_depth = outputs.predicted_depth
+         outputs = predicted_depth.squeeze().cpu().numpy()
+         if len(outputs.shape) == 2:
+             output = outputs[np.newaxis, :, :]
+         else:
+             output = outputs
+         formatted = (output * 255 / np.max(output)).astype("uint8")  # normalize depth to [0, 255]
+         depth_image = Image.fromarray(formatted[0, ...]).resize((W, H))  # back to the input resolution
+         self.model.to("cpu")
+         return depth_image
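
For quick verification outside the Space, the depth annotator can be exercised on its own, mirroring the __main__ block in annotator/dsine_hub.py. A minimal sketch (it reuses the sample image URL from that file; the inline comments describe what the code above implies, not documented behavior):

    import numpy as np
    from diffusers.utils import load_image
    from annotator.midas import DepthDetector

    image = np.asarray(load_image(
        "https://qhstaticssl.kujiale.com/image/jpeg/1716177580588/9AAA49344B9CE33512C4EBD0A287495F.jpg"
    ))
    depth_detector = DepthDetector()     # downloads Intel/dpt-hybrid-midas on first use
    depth_image = depth_detector(image)  # PIL image, normalized depth at the input resolution
    depth_image.save("depth_image.jpg")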
annotator/upernet.py ADDED
@@ -0,0 +1,190 @@
+ import torch
+ import numpy as np
+ from PIL import Image
+ from transformers import AutoImageProcessor
+ from transformers import UperNetForSemanticSegmentation
+
+
+ class SegmDetector:
+     def __init__(self, model_path=None):
+         if model_path is not None:
+             self.model_path = model_path
+         else:
+             self.model_path = "openmmlab/upernet-convnext-small"
+         self.model = UperNetForSemanticSegmentation.from_pretrained(self.model_path)
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.feature_extractor = AutoImageProcessor.from_pretrained(self.model_path)
+         self.palette = [  # ADE20K color palette (150 classes)
+             [120, 120, 120],
+             [180, 120, 120],
+             [6, 230, 230],
+             [80, 50, 50],
+             [4, 200, 3],
+             [120, 120, 80],
+             [140, 140, 140],
+             [204, 5, 255],
+             [230, 230, 230],
+             [4, 250, 7],
+             [224, 5, 255],
+             [235, 255, 7],
+             [150, 5, 61],
+             [120, 120, 70],
+             [8, 255, 51],
+             [255, 6, 82],
+             [143, 255, 140],
+             [204, 255, 4],
+             [255, 51, 7],
+             [204, 70, 3],
+             [0, 102, 200],
+             [61, 230, 250],
+             [255, 6, 51],
+             [11, 102, 255],
+             [255, 7, 71],
+             [255, 9, 224],
+             [9, 7, 230],
+             [220, 220, 220],
+             [255, 9, 92],
+             [112, 9, 255],
+             [8, 255, 214],
+             [7, 255, 224],
+             [255, 184, 6],
+             [10, 255, 71],
+             [255, 41, 10],
+             [7, 255, 255],
+             [224, 255, 8],
+             [102, 8, 255],
+             [255, 61, 6],
+             [255, 194, 7],
+             [255, 122, 8],
+             [0, 255, 20],
+             [255, 8, 41],
+             [255, 5, 153],
+             [6, 51, 255],
+             [235, 12, 255],
+             [160, 150, 20],
+             [0, 163, 255],
+             [140, 140, 140],
+             [250, 10, 15],
+             [20, 255, 0],
+             [31, 255, 0],
+             [255, 31, 0],
+             [255, 224, 0],
+             [153, 255, 0],
+             [0, 0, 255],
+             [255, 71, 0],
+             [0, 235, 255],
+             [0, 173, 255],
+             [31, 0, 255],
+             [11, 200, 200],
+             [255, 82, 0],
+             [0, 255, 245],
+             [0, 61, 255],
+             [0, 255, 112],
+             [0, 255, 133],
+             [255, 0, 0],
+             [255, 163, 0],
+             [255, 102, 0],
+             [194, 255, 0],
+             [0, 143, 255],
+             [51, 255, 0],
+             [0, 82, 255],
+             [0, 255, 41],
+             [0, 255, 173],
+             [10, 0, 255],
+             [173, 255, 0],
+             [0, 255, 153],
+             [255, 92, 0],
+             [255, 0, 255],
+             [255, 0, 245],
+             [255, 0, 102],
+             [255, 173, 0],
+             [255, 0, 20],
+             [255, 184, 184],
+             [0, 31, 255],
+             [0, 255, 61],
+             [0, 71, 255],
+             [255, 0, 204],
+             [0, 255, 194],
+             [0, 255, 82],
+             [0, 10, 255],
+             [0, 112, 255],
+             [51, 0, 255],
+             [0, 194, 255],
+             [0, 122, 255],
+             [0, 255, 163],
+             [255, 153, 0],
+             [0, 255, 10],
+             [255, 112, 0],
+             [143, 255, 0],
+             [82, 0, 255],
+             [163, 255, 0],
+             [255, 235, 0],
+             [8, 184, 170],
+             [133, 0, 255],
+             [0, 255, 92],
+             [184, 0, 255],
+             [255, 0, 31],
+             [0, 184, 255],
+             [0, 214, 255],
+             [255, 0, 112],
+             [92, 255, 0],
+             [0, 224, 255],
+             [112, 224, 255],
+             [70, 184, 160],
+             [163, 0, 255],
+             [153, 0, 255],
+             [71, 255, 0],
+             [255, 0, 163],
+             [255, 204, 0],
+             [255, 0, 143],
+             [0, 255, 235],
+             [133, 255, 0],
+             [255, 0, 235],
+             [245, 0, 255],
+             [255, 0, 122],
+             [255, 245, 0],
+             [10, 190, 212],
+             [214, 255, 0],
+             [0, 204, 255],
+             [20, 0, 255],
+             [255, 255, 0],
+             [0, 153, 255],
+             [0, 41, 255],
+             [0, 255, 204],
+             [41, 0, 255],
+             [41, 255, 0],
+             [173, 0, 255],
+             [0, 245, 255],
+             [71, 0, 255],
+             [122, 0, 255],
+             [0, 255, 184],
+             [0, 92, 255],
+             [184, 255, 0],
+             [0, 133, 255],
+             [255, 214, 0],
+             [25, 194, 194],
+             [102, 255, 0],
+             [92, 0, 255],
+         ]
+
+     @torch.no_grad()
+     def __call__(self, image):
+         self.model.to(self.device)
+         H, W, C = image.shape
+
+         pixel_values = self.feature_extractor(
+             images=image, return_tensors="pt"
+         ).pixel_values
+         pixel_values = pixel_values.to(self.device)
+         outputs = self.model(pixel_values)
+         segm_image = self.feature_extractor.post_process_semantic_segmentation(outputs)
+         segm_image = segm_image[0].cpu()
+         color_seg = np.zeros(
+             (segm_image.shape[0], segm_image.shape[1], 3), dtype=np.uint8
+         )
+         for label, color in enumerate(self.palette):
+             color_seg[segm_image == label, :] = color
+         color_seg = color_seg.astype(np.uint8)
+         segm_image = Image.fromarray(color_seg).resize((W, H))
+         self.model.to("cpu")
+         return segm_image
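
A side note on the colorization loop in __call__: iterating over the 150 labels works, but the same result can be obtained with a single palette lookup. A minimal sketch, assuming segm is the (H, W) integer label map returned by post_process_semantic_segmentation and palette is the list defined in SegmDetector.__init__ (the helper name colorize is hypothetical):

    import numpy as np

    def colorize(segm: np.ndarray, palette: list) -> np.ndarray:
        lut = np.array(palette, dtype=np.uint8)  # (150, 3) lookup table
        return lut[segm]                         # fancy indexing -> (H, W, 3) color image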
annotator/util.py ADDED
@@ -0,0 +1,38 @@
+ import numpy as np
+ import cv2
+
+
+ def HWC3(x):
+     assert x.dtype == np.uint8
+     if x.ndim == 2:
+         x = x[:, :, None]
+     assert x.ndim == 3
+     H, W, C = x.shape
+     assert C == 1 or C == 3 or C == 4
+     if C == 3:
+         return x
+     if C == 1:
+         return np.concatenate([x, x, x], axis=2)
+     if C == 4:
+         color = x[:, :, 0:3].astype(np.float32)
+         alpha = x[:, :, 3:4].astype(np.float32) / 255.0
+         y = color * alpha + 255.0 * (1.0 - alpha)  # composite onto a white background
+         y = y.clip(0, 255).astype(np.uint8)
+         return y
+
+
+ def resize_image(input_image, resolution):
+     H, W, C = input_image.shape
+     H = float(H)
+     W = float(W)
+     k = float(resolution) / max(H, W)
+     H *= k
+     W *= k
+     H = int(np.round(H / 64.0)) * 64  # snap to a multiple of 64
+     W = int(np.round(W / 64.0)) * 64
+     img = cv2.resize(
+         input_image,
+         (W, H),
+         interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA,
+     )
+     return img
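
As a worked example of the rounding in resize_image: an 800x1200 input at resolution=768 is scaled by k = 768 / 1200 = 0.64 to 512x768, and both sides are already multiples of 64, so they are kept as-is. A small sketch with a dummy image (shapes only, no real content):

    import numpy as np
    from annotator.util import HWC3, resize_image

    img = np.zeros((800, 1200), dtype=np.uint8)  # grayscale; HWC3 expands it to 3 channels
    out = resize_image(HWC3(img), 768)
    print(out.shape)  # (512, 768, 3), resized with INTER_AREA since k < 1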
app.py CHANGED
@@ -1,7 +1,150 @@
  import gradio as gr
 
- def greet(name):
-     return "Hello " + name + "!!"
 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import torch
+ import spaces
+ import numpy as np
+ from diffusers import (
+     ControlNetModel,
+     StableDiffusionControlNetPipeline,
+     UniPCMultistepScheduler,
+ )
  import gradio as gr
 
+ from annotator.util import resize_image, HWC3
+ from annotator.midas import DepthDetector
+ from annotator.dsine_hub import NormalDetector
+ from annotator.upernet import SegmDetector
 
+ controlnet_checkpoint = "kujiale-ai/controlnet"
+ # Initialize pipeline
+ controlnet = ControlNetModel.from_pretrained(
+     controlnet_checkpoint,
+     subfolder="control_v1_sd15_layout_fp16",
+     torch_dtype=torch.float16,
+ )
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
+     "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+ ).to("cuda")
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+ apply_depth = DepthDetector()
+ apply_normal = NormalDetector()
+ apply_segm = SegmDetector()
+
+
+ @spaces.GPU(duration=10)
+ def generate(
+     input_image,
+     prompt,
+     a_prompt,
+     n_prompt,
+     num_samples,
+     image_resolution,
+     steps,
+     strength,
+     guidance_scale,
+     seed,
+ ):
+     color_image = resize_image(HWC3(input_image), image_resolution)
+     # set seed
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+
+     with torch.no_grad():
+         depth_image = apply_depth(color_image)
+         normal_image = apply_normal(color_image)
+         segm_image = apply_segm(color_image)
+
+     # Prepare Layout Control Image
+     depth_image = np.array(depth_image, dtype=np.float32) / 255.0
+     depth_image = torch.from_numpy(depth_image[:, :, None])[None].permute(
+         0, 3, 1, 2
+     )
+     normal_image = np.array(normal_image, dtype=np.float32)
+     normal_image = normal_image / 127.5 - 1.0
+     normal_image = torch.from_numpy(normal_image)[None].permute(0, 3, 1, 2)
+     segm_image = np.array(segm_image, dtype=np.float32) / 255.0
+     segm_image = torch.from_numpy(segm_image)[None].permute(0, 3, 1, 2)
+     control_image = torch.cat([depth_image, normal_image, segm_image], dim=1)  # 7 channels: 1 depth + 3 normal + 3 segm
+
+     generator = torch.Generator(device="cuda").manual_seed(seed)
+     images = pipe(
+         prompt + a_prompt,
+         negative_prompt=n_prompt,
+         num_images_per_prompt=num_samples,
+         num_inference_steps=steps,
+         image=control_image,
+         generator=generator,
+         guidance_scale=guidance_scale,
+         controlnet_conditioning_scale=strength,
+     ).images
+     return images
+
+
+ block = gr.Blocks().queue()
+ with block:
+     with gr.Row():
+         gr.Markdown("## KuJiaLe Layout ControlNet Demo")
+     with gr.Row():
+         input_image = gr.Image(source="upload", type="numpy", label="input_image")
+     with gr.Row():
+         prompt = gr.Textbox(label="Prompt")
+     with gr.Row():
+         run_button = gr.Button(label="Run")
+     with gr.Row():
+         with gr.Column():
+             with gr.Accordion("Advanced options", open=False):
+                 num_samples = gr.Slider(
+                     label="Images", minimum=1, maximum=2, value=1, step=1
+                 )
+                 image_resolution = gr.Slider(
+                     label="Image Resolution",
+                     minimum=512,
+                     maximum=768,
+                     value=768,
+                     step=64,
+                 )
+                 strength = gr.Slider(
+                     label="Control Strength",
+                     minimum=0.0,
+                     maximum=2.0,
+                     value=1,
+                     step=0.1,
+                 )
+                 steps = gr.Slider(
+                     label="Steps", minimum=1, maximum=50, value=25, step=1
+                 )
+                 guidance_scale = gr.Slider(
+                     label="Guidance Scale",
+                     minimum=0.1,
+                     maximum=20.0,
+                     value=7.5,
+                     step=0.1,
+                 )
+                 seed = gr.Slider(
+                     label="Seed", minimum=-1, maximum=2147483647, value=1, step=1
+                 )
+                 a_prompt = gr.Textbox(
+                     label="Added Prompt", value="best quality, extremely detailed"
+                 )
+                 n_prompt = gr.Textbox(
+                     label="Negative Prompt",
+                     value="longbody, lowres, bad anatomy, human, extra digit, fewer digits, cropped, worst quality, low quality",
+                 )
+
+     with gr.Row():
+         image_gallery = gr.Gallery(
+             label="Output", show_label=False, elem_id="gallery"
+         ).style(grid=1, height="auto")
+
+     ips = [
+         input_image,
+         prompt,
+         a_prompt,
+         n_prompt,
+         num_samples,
+         image_resolution,
+         steps,
+         strength,
+         guidance_scale,
+         seed,
+     ]
+     run_button.click(fn=generate, inputs=ips, outputs=[image_gallery])
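
For clarity on what the pipeline receives as image: the three annotator outputs are concatenated along the channel axis into a single 7-channel tensor (1 depth + 3 normal + 3 segmentation channels), which the layout ControlNet checkpoint is presumably trained to consume. A minimal shape check with dummy data (values and sizes are placeholders, not tied to the checkpoint):

    import torch

    H, W = 512, 768
    depth = torch.rand(1, 1, H, W)           # depth scaled to [0, 1]
    normal = torch.rand(1, 3, H, W) * 2 - 1  # normals mapped to [-1, 1]
    segm = torch.rand(1, 3, H, W)            # palette colors scaled to [0, 1]
    control_image = torch.cat([depth, normal, segm], dim=1)
    print(control_image.shape)               # torch.Size([1, 7, 512, 768])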