ifmain committed on
Commit 319cffc
1 Parent(s): e09ffb3

Update app.py

Files changed (1)
  1. app.py +132 -34
app.py CHANGED
@@ -1,19 +1,47 @@
- import gradio as gr
- from PIL import Image
- import numpy as np
- import base64
  from io import BytesIO
  import difflib
  import spaces

- # Assumed available GPU decorator and spaces from Hugging Face
- # Utility Functions
  def image_to_base64(image: Image.Image):
      buffered = BytesIO()
      image.save(buffered, format="JPEG")
      return base64.b64encode(buffered.getvalue()).decode("utf-8")

  def get_most_similar_string(target_string, string_array):
      best_match = string_array[0]
      best_match_ratio = 0
      for candidate_string in string_array:
@@ -21,51 +49,121 @@ def get_most_similar_string(target_string, string_array):
          if similarity_ratio > best_match_ratio:
              best_match = candidate_string
              best_match_ratio = similarity_ratio
      return best_match

- # GPU-Aware Model Loading and Operations
  @spaces.GPU
- def load_models_and_process(image: Image.Image, target: str):
-     from ultralytics import YOLO
-     from diffusers import AutoPipelineForInpainting
-     from transformers import pipeline
-     import torch
-
-     # Model loading
-     yolo_model = YOLO('yolov8x-seg.pt')
-     sdxl = AutoPipelineForInpainting.from_pretrained("diffusers/stable-diffusion-xl-1.0-inpainting-0.1", torch_dtype=torch.float16).to("cuda")
-     image_captioner = pipeline("image-to-text", model="Abdou/vit-swin-base-224-gpt2-image-captioning", device=0)
-
-     # Placeholder function calls (e.g., getSegments, getDescript)
-     # Implement the actual logic here as needed
-
-     # For demonstration, we return a placeholder for result_image, caption, response
-     result_image = image  # Placeholder: Implement actual image processing
-     caption = "This is a sample caption."  # Placeholder: Use `image_captioner` as needed
-     response = "Sample response based on processing."  # Placeholder: Construct response from processing
-
-     return result_image, caption, response
-
- # Gradio Interface
  def full_pipeline(image, target):
-     result_image, caption, response = load_models_and_process(image, target)
-     # Convert PIL image to numpy array for Gradio output
-     if isinstance(result_image, Image.Image):
-         result_image = np.array(result_image)
-     return result_image, caption, response

  iface = gr.Interface(
-     fn=full_pipeline,
      inputs=[
-         gr.Image(label="Upload Image", type='pil'),
          gr.Textbox(label="What to delete?"),
-     ],
      outputs=[
-         gr.Image(label="Result Image"),
          gr.Textbox(label="Caption"),
          gr.Textbox(label="Message"),
      ],
      live=False
  )

- iface.launch()
+ # Standard Libraries
+ import time
  from io import BytesIO
+ import base64
+
+ # Data Handling and Image Processing
+ import numpy as np
+ from PIL import Image
+
+ # Machine Learning and AI Models
+ import torch
+ from transformers import pipeline
+ from diffusers import AutoPipelineForInpainting
+ from ultralytics import YOLO
+
+ # Text and Data Manipulation
  import difflib
+
+ # UI and Application Framework
+ import gradio as gr
  import spaces

+
+ # Constants
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ # Load the models once at import time so every request reuses them
+ yoloModel = YOLO('yolov8x-seg.pt')
+ sdxl = AutoPipelineForInpainting.from_pretrained(
+     "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
+     torch_dtype=torch.float32
+ ).to(DEVICE)
+ image_captioner = pipeline("image-to-text", model="Abdou/vit-swin-base-224-gpt2-image-captioning", device=DEVICE)
+
+
  def image_to_base64(image: Image.Image):
      buffered = BytesIO()
      image.save(buffered, format="JPEG")
      return base64.b64encode(buffered.getvalue()).decode("utf-8")

+
  def get_most_similar_string(target_string, string_array):
+     differ = difflib.Differ()
      best_match = string_array[0]
      best_match_ratio = 0
      for candidate_string in string_array:

          if similarity_ratio > best_match_ratio:
              best_match = candidate_string
              best_match_ratio = similarity_ratio
+
      return best_match

+
+ # YOLO segmentation
+ @spaces.GPU
+ def getClasses(model, img1):
+     # Run YOLOv8-seg on the image; r.plot() draws the detections (BGR)
+     results = model([img1])
+     out = []
+     for r in results:
+         im_array = r.plot()
+         out.append(r)
+
+     return r, im_array[..., ::-1], results  # last Results, RGB plot, raw results
+
+
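+ # Merge per-detection masks so each class name maps to one union mask
+ # (pixel-wise maximum across detections of the same class).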
+ def getMasks(out):
+     allout = {}
+     class_masks = {}
+     for a in out:
+         class_name = a['name']
+         mask = a['img']
+         if class_name in class_masks:
+             # Union with the mask already stored for this class
+             class_masks[class_name] = Image.fromarray(
+                 np.maximum(np.array(class_masks[class_name]), np.array(mask))
+             )
+         else:
+             class_masks[class_name] = mask
+     for class_name, mask in class_masks.items():
+         allout[class_name] = mask
+     return allout
+
+
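+ # Convert each detection's raw mask tensor into an 8-bit grayscale PIL image
+ # and group the results by class via getMasks.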
+ def joinClasses(classes):
+     i = 0
+     out = []
+     for r in classes:
+         masks = r.masks
+         name0 = r.names[int(r.boxes.cls.cpu().numpy()[0])]
+
+         mask1 = masks[0]
+         mask = mask1.data[0].cpu().numpy()
+         polygon = mask1.xy[0]
+         # Normalize the mask values to 0-255 if needed
+         mask_normalized = ((mask - mask.min()) * (255 / (mask.max() - mask.min()))).astype(np.uint8)
+         mask_img = Image.fromarray(mask_normalized, "L")
+         out.append({'name': name0, 'img': mask_img})
+         i += 1
+
+     allMask = getMasks(out)
+     return allMask
+
+
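+ # Full segmentation step: run YOLO on the image and return a
+ # {class_name: mask image} dict.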
+ def getSegments(yoloModel, img1):
+     classes, image, results1 = getClasses(yoloModel, img1)
+     allMask = joinClasses(classes)
+     return allMask
+
+
+ # Captioning
  @spaces.GPU
+ def captionMaker(base64_img):
+     return image_captioner(base64_img)[0]['generated_text']
+
+
+ def getDescript(image_captioner, img1):
+     base64_img = image_to_base64(img1)
+     caption = captionMaker(base64_img)
+     return caption

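+ # Build the inpainting prompt: find the caption word closest to the target
+ # class and drop it together with up to two words on either side.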
+ def rmGPT(caption, remove_class):
+     arstr = caption.split(' ')
+     popular = get_most_similar_string(remove_class, arstr)
+     ind = arstr.index(popular)
+     new = []
+     for i in range(len(arstr)):
+         if i not in list(range(ind - 2, ind + 3)):
+             new.append(arstr[i])
+     return ' '.join(new)

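+ # Inpaint the masked region with SDXL, then resize back to the input size.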
+ @spaces.GPU
+ def ChangeOBJ(sdxl_m, img1, response, mask1):
+     size = img1.size
+     image = sdxl_m(prompt=response, image=img1, mask_image=mask1).images[0]
+     return image.resize((size[0], size[1]))
+
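+ # End-to-end pipeline: segment, match the requested class, caption the image,
+ # build the replacement prompt, and inpaint.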
  def full_pipeline(image, target):
+     img1 = Image.fromarray(image.astype('uint8'), 'RGB')
+     allMask = getSegments(yoloModel, img1)
+     target_to_remove = get_most_similar_string(target, list(allMask.keys()))
+     caption = getDescript(image_captioner, img1)
+
+     response = rmGPT(caption, target_to_remove)
+     mask1 = allMask[target_to_remove]
+
+     remimg = ChangeOBJ(sdxl, img1, response, mask1)
+
+     return remimg, caption, response
+

  iface = gr.Interface(
+     fn=full_pipeline,
      inputs=[
+         gr.Image(label="Upload Image"),
          gr.Textbox(label="What to delete?"),
+     ],
      outputs=[
+         gr.Image(label="Result Image", type="numpy"),
          gr.Textbox(label="Caption"),
          gr.Textbox(label="Message"),
      ],
      live=False
  )

+ iface.launch()