import gradio as gr
import spaces
import cv2
from diffusers import AutoPipelineForInpainting
from transformers import pipeline
from ultralytics import YOLO
from PIL import Image
import numpy as np
import torch
import base64
from io import BytesIO
import difflib

# Constants
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


# Helper functions
def image_to_base64(image: Image.Image):
    """Encode a PIL image as a base64 JPEG string."""
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def get_most_similar_string(target_string, string_array):
    """Return the entry of string_array most similar to target_string."""
    best_match = string_array[0]
    best_match_ratio = 0
    for candidate_string in string_array:
        similarity_ratio = difflib.SequenceMatcher(None, target_string, candidate_string).ratio()
        if similarity_ratio > best_match_ratio:
            best_match = candidate_string
            best_match_ratio = similarity_ratio
    return best_match


# Load YOLO segmentation model on CPU
yoloModel = YOLO('yolov8x-seg.pt')
yoloModel.to('cpu')


def getClasses(model, img1):
    """Run YOLO segmentation on a single image and return its result,
    the annotated image (BGR converted to RGB), and the raw results list."""
    results = model([img1])
    out = []
    for r in results:
        im_array = r.plot()
        out.append(r)
    # Only one image is passed in, so r / im_array refer to its single result.
    return r, im_array[..., ::-1], results


def getMasks(out):
    """Merge per-detection masks that share a class name into one mask per class."""
    allout = {}
    class_masks = {}
    for a in out:
        class_name = a['name']
        mask = a['img']
        if class_name in class_masks:
            # Union of masks via element-wise maximum
            class_masks[class_name] = Image.fromarray(
                np.maximum(np.array(class_masks[class_name]), np.array(mask))
            )
        else:
            class_masks[class_name] = mask
    for class_name, mask in class_masks.items():
        allout[class_name] = mask
    return allout


def joinClasses(classes):
    """Convert each detection into a dilated grayscale mask keyed by class name."""
    out = []
    for r in classes:
        masks = r.masks
        name0 = r.names[int(r.boxes.cls.cpu().numpy()[0])]

        mask1 = masks[0]
        mask = mask1.data[0].cpu().numpy()
        # Normalize to 0-255 and dilate so inpainting covers the object's edges
        mask_normalized = ((mask - mask.min()) * (255 / (mask.max() - mask.min()))).astype(np.uint8)
        kernel = np.ones((10, 10), np.uint8)
        mask_with_border = cv2.dilate(mask_normalized, kernel, iterations=1)
        mask_img = Image.fromarray(mask_with_border, "L")
        out.append({'name': name0, 'img': mask_img})

    allMask = getMasks(out)
    return allMask


def getSegments(yoloModel, img1):
    """Segment the image and return a dict mapping class name to mask image."""
    classes, image, results1 = getClasses(yoloModel, img1)
    im = Image.fromarray(image)
    im.save('classes.jpg')
    allMask = joinClasses(classes)
    return allMask


@spaces.GPU
def getDescript(img1):
    """Caption the input image with a ViT-Swin/GPT-2 image-captioning pipeline."""
    image_captioner = pipeline("image-to-text", model="Abdou/vit-swin-base-224-gpt2-image-captioning", device='cuda')
    base64_img = image_to_base64(img1)
    caption = image_captioner(base64_img)[0]['generated_text']
    return caption


def rmGPT(caption, remove_class, change):
    """Edit the caption: either drop the words around the matched class,
    or replace that word with the requested change."""
    arstr = caption.split(' ')
    popular = get_most_similar_string(remove_class, arstr)
    ind = arstr.index(popular)
    if len(change) < 3:
        # No replacement given: drop a window of words around the matched class
        new = []
        rng = round(len(arstr) / 5)
        for i in range(len(arstr)):
            if i not in list(range(ind - rng, ind + rng)):
                new.append(arstr[i])
        return ' '.join(new)
    else:
        # Replace the matched word with the requested object
        arstr[ind] = change
        return ' '.join(arstr)


@spaces.GPU
def ChangeOBJ(img1, response, mask1):
    """Inpaint the masked region with SDXL, using the edited caption as prompt."""
    sdxl = AutoPipelineForInpainting.from_pretrained(
        "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16
    ).to('cuda')
    size = img1.size
    image = sdxl(prompt=response, image=img1, mask_image=mask1).images[0]
    return image.resize((size[0], size[1]))


def full_pipeline(image, target, change):
    """Segment, caption, edit the caption, then inpaint the selected object."""
    img1 = Image.fromarray(image.astype('uint8'), 'RGB')
    allMask = getSegments(yoloModel, img1)
    target_to_remove = get_most_similar_string(target, list(allMask.keys()))
    caption = getDescript(img1)

    response = rmGPT(caption, target_to_remove, change)
    mask1 = allMask[target_to_remove]

    remimg = ChangeOBJ(img1, response, mask1)

    return remimg, caption, response


iface = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.Image(label="Upload Image"),
        gr.Textbox(label="What to delete?"),
        gr.Textbox(label="Change?"),
    ],
    outputs=[
        gr.Image(label="Result Image", type="numpy"),
        gr.Textbox(label="Caption"),
        gr.Textbox(label="Message"),
    ],
    live=False,
)

iface.launch(share=True)
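# Optional local smoke test (a minimal sketch, not part of the app itself):
# "sample.jpg" and the prompts below are placeholder values. Uncomment this block
# (and skip the iface.launch call above) to run the pipeline once without the UI.
#
# if __name__ == "__main__":
#     test_img = np.array(Image.open("sample.jpg").convert("RGB"))
#     result, caption, prompt = full_pipeline(test_img, "dog", "cat")
#     result.save("result.jpg")
#     print("Caption:", caption)
#     print("Inpainting prompt:", prompt)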