bwingenroth committed
Commit 2d72836
1 Parent(s): 387f9c0

Refactor to also run from command line
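
The heart of the change: building the Gradio UI now happens inside setup_gradio_interface(), and launching it is deferred to a main() function behind an if __name__ == "__main__" guard, so the app can be started directly from the command line (python app.py) rather than launching as a side effect of loading the module. A minimal sketch of that pattern, with the demo-specific inputs, galleries, CSS, and auth omitted (see the full diff below for the real wiring):

import gradio as gr

def setup_gradio_interface():
    # Build the Blocks UI and return it instead of launching at import time
    with gr.Blocks() as iface:
        gr.Markdown("# OIDA Image Collection demo")
        # ... MultimodalTextbox input, Submit button, and result galleries go here ...
    return iface

def main():
    iface = setup_gradio_interface()
    iface.launch(debug=True)  # runs only when invoked as a script, e.g. python app.py

if __name__ == "__main__":
    main()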

Files changed (1)
  1. app.py +199 -108
app.py CHANGED
@@ -18,6 +18,7 @@ from pdf2image import convert_from_bytes, convert_from_path
 
 import re
 import requests
+from collections import namedtuple
 from urllib.parse import urlparse, parse_qs
 
 from unilm.dit.object_detection.ditod import add_vit_config
@@ -51,69 +52,128 @@ cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # Step 4: define model
 predictor = DefaultPredictor(cfg)
 
+# Set up internal data structure
+# Define a namedtuple for holding extracted image data
+ExtractedImage = namedtuple("ExtractedImage", ["image", "annotated_page", "original_page", "confidence_score", "top_left", "bottom_right", "num_pixels", "is_color"])
+
 
 def analyze_image(img):
-    md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
-    if cfg.DATASETS.TEST[0]=='icdar2019_test':
-        md.set(thing_classes=["table"])
-    else:
-        md.set(thing_classes=["text","title","list","table","figure"]) ## these are categories from PubLayNet (PubMed PDF/XML data): https://ieeexplore.ieee.org/document/8977963
-
-    outputs = predictor(img)
-    instances = outputs["instances"]
-
-    # Ensure we're operating on CPU for numpy compatibility
-    instances = instances.to("cpu")
-
-    # Filter out figures based on class labels
-    high_confidence = []
-    medium_confidence = []
-    low_confidence = []
-    for i in range(len(instances)):
-        if md.thing_classes[instances.pred_classes[i]] == "figure":
-            box = instances.pred_boxes.tensor[i].numpy().astype(int)
-            cropped_img = img[box[1]:box[3], box[0]:box[2]]
-            confidence_score = instances.scores[i].numpy() * 100 # convert to percentage
-            confidence_text = f"Score: {confidence_score:.2f}%"
-
-            # Overlay confidence score on the image
-            # Enhanced label visualization with orange color
-            font_scale = 0.9
-            font_thickness = 2
-            text_color = (255, 255, 255) # white background
-            background_color = (255, 165, 0) # RGB for orange
-
-            (text_width, text_height), _ = cv2.getTextSize(confidence_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
-            padding = 12
-            text_offset_x = padding - 3
-            text_offset_y = cropped_img.shape[0] - padding + 2
-            box_coords = ((text_offset_x, text_offset_y + padding // 2), (text_offset_x + text_width + padding, text_offset_y - text_height - padding // 2))
-            cv2.rectangle(cropped_img, box_coords[0], box_coords[1], background_color, cv2.FILLED)
-            cv2.putText(cropped_img, confidence_text, (text_offset_x, text_offset_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)
-
-            # Categorize images based on confidence levels
-            if confidence_score > 85:
-                high_confidence.append(cropped_img)
-            elif confidence_score > 50:
-                medium_confidence.append(cropped_img)
-            else:
-                low_confidence.append(cropped_img)
-
-    v = Visualizer(img[:, :, ::-1], md, scale=1.0, instance_mode=ColorMode.SEGMENTATION)
-    result_image = v.draw_instance_predictions(instances).get_image()[:, :, ::-1]
-
-    return result_image, high_confidence, medium_confidence, low_confidence
-    # output = predictor(img)["instances"]
-    # v = Visualizer(img[:, :, ::-1],
-    #                md,
-    #                scale=1.0,
-    #                instance_mode=ColorMode.SEGMENTATION)
-    # result = v.draw_instance_predictions(output.to("cpu"))
-    # result_image = result.get_image()[:, :, ::-1]
-    #
-    ## figs = [img[box[1]:box[3], box[0]:box[2]] for box, cls in zip(output.pred_boxes, output.pred_classes) if md.thing_classes[cls] == "figure"]
-    #
-    # return result_image, figs
+    images = extract_images(img)
+
+    # Filter out figures based on class labels
+    high_confidence = []
+    medium_confidence = []
+    low_confidence = []
+    result_image = img
+
+    for extracted_image_object in images:
+        cropped_img = extracted_image_object.image
+        confidence_score = extracted_image_object.confidence_score
+        confidence_text = f"Score: {confidence_score:.2f}%"
+
+        if cropped_img is not None:
+            # Overlay confidence score on the image
+            # Enhanced label visualization with orange color
+            font_scale = 0.9
+            font_thickness = 2
+            text_color = (255, 255, 255) # white background
+            #background_color = (0, 165, 255) # BGR for orange
+            background_color = (255, 165, 0) # RGB for orange
+
+            (text_width, text_height), _ = cv2.getTextSize(confidence_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness)
+            padding = 12
+            text_offset_x = padding - 3
+            text_offset_y = cropped_img.shape[0] - padding + 2
+            box_coords = ((text_offset_x, text_offset_y + padding // 2), (text_offset_x + text_width + padding, text_offset_y - text_height - padding // 2))
+            cv2.rectangle(cropped_img, box_coords[0], box_coords[1], background_color, cv2.FILLED)
+            cv2.putText(cropped_img, confidence_text, (text_offset_x, text_offset_y), cv2.FONT_HERSHEY_SIMPLEX, font_scale, text_color, font_thickness)
+            # end adding score annotation
+
+        #result_image.append(extracted_image_object.annotated_page)
+        if extracted_image_object.annotated_page is not None:
+            result_image = extracted_image_object.annotated_page
+        # Categorize images based on confidence levels
+        if confidence_score > 85:
+            high_confidence.append(cropped_img)
+        elif confidence_score > 50:
+            medium_confidence.append(cropped_img)
+        elif cropped_img is not None:
+            low_confidence.append(cropped_img)
+
+    return result_image, high_confidence, medium_confidence, low_confidence
+
+
+def extract_images(img):
+    md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+    if cfg.DATASETS.TEST[0]=='icdar2019_test':
+        md.set(thing_classes=["table"])
+    else:
+        md.set(thing_classes=["text","title","list","table","figure"]) ## these are categories from PubLayNet (PubMed PDF/XML data): https://ieeexplore.ieee.org/document/8977963
+
+    is_color = None
+    print(f"###################### Is effectively grayscale? {is_effectively_grayscale_np(img)} #######################")
+    print(f"############################### ndim {img.ndim} -- shape[2] {img.shape[2]} #######################")
+    # Ensure the image is in the correct format
+    if img.ndim == 2: # Image is grayscale, needs converting
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    elif img.ndim == 3 and img.shape[2] == 3:
+        if not is_effectively_grayscale_np(img): # Image is RGB mode, but still only using grayscale colors
+            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+            is_color = True
+    outputs = predictor(img)
+    instances = outputs["instances"]
+
+    # Ensure we're operating on CPU for numpy compatibility
+    instances = instances.to("cpu")
+
+    extracted_images = []
+
+    v = Visualizer(img[:, :, ::-1], md, scale=1.0, instance_mode=ColorMode.SEGMENTATION)
+    result_image = v.draw_instance_predictions(instances).get_image()[:, :, ::-1]
+    result_image = cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB)
+
+    for i in range(len(instances)):
+        if md.thing_classes[instances.pred_classes[i]] == "figure":
+            box = instances.pred_boxes.tensor[i].numpy().astype(int)
+            cropped_img = img[box[1]:box[3], box[0]:box[2]]
+            cropped_img = cv2.cvtColor(cropped_img, cv2.COLOR_BGR2RGB)
+            confidence_score = instances.scores[i].numpy() * 100 # convert to percentage
+            num_pixels = cropped_img.shape[0] * cropped_img.shape[1]
+            is_color = len(cropped_img.shape) == 3 and cropped_img.shape[2] == 3 and not is_effectively_grayscale_np(img)
+
+            extracted_images.append(ExtractedImage(
+                image=cropped_img,
+                annotated_page=result_image,
+                original_page=img,
+                confidence_score=confidence_score,
+                top_left=f"{box[0]}-{box[1]}",
+                bottom_right=f"{box[2]}-{box[3]}",
+                num_pixels=num_pixels,
+                is_color=is_color
+            ))
+
+    if not extracted_images: # there were none to process, still need to return basic image
+        extracted_images.append(ExtractedImage(
+            image=None, # or an appropriate default value
+            annotated_page=result_image,
+            original_page=img, # The original input image
+            confidence_score=-1, # Indicates no confidence
+            top_left=None,
+            bottom_right=None, # No bounding box coordinates
+            num_pixels=0, # No pixels counted
+            is_color=False # Default to grayscale or False
+        ))
+
+    return extracted_images
+
+
+def is_effectively_grayscale_np(array):
+    if array.ndim != 3 or array.shape[2] != 3:
+        raise ValueError("Input must be an RGB image")
+    # Check if all color channels are equal across the image
+    r, g, b = array[:,:,0], array[:,:,1], array[:,:,2]
+    return np.array_equal(r, g) and np.array_equal(g, b)
+
 
 def handle_input(input_data):
     images = []
@@ -121,7 +181,7 @@ def handle_input(input_data):
     #input_data is a dict with keys 'text' and 'files'
     if 'text' in input_data and input_data['text']:
         input_text = input_data['text'].strip()
-
+
         # this is either a URL or a PDF ID
        if input_text.startswith('http://') or input_text.startswith('https://'):
            # Extract the ID from the URL
@@ -164,22 +224,21 @@ def handle_input(input_data):
     if not images:
         raise ValueError("No valid input provided. Please upload a file or enter a PDF ID.")
 
-    # Assuming processing images returns galleries of images by confidence
+    # Assuming process_images returns galleries of images by confidence
     return process_images(images)
 
 def load_image(img_path):
     print(f"Loading image: {img_path}")
     # Load an image from a file path
     image = Image.open(img_path)
+    print(f" Image mode: {image.mode}") # Add this debug line
+    if image.mode != 'RGB':
+        print(f" Converting from {image.mode} to RGB")
+        image = image.convert('RGB')
     if isinstance(image, Image.Image):
+        print(" Converting to numpy")
         image = np.array(image) # Convert PIL Image to numpy array
-    # Ensure the image is in the correct format
-    if image.ndim == 2: # Image is grayscale
-        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
-    elif image.ndim == 3 and image.shape[2] == 3:
-        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
-        # image = image[:, :, ::-1] # Convert RGB to BGR if necessary
-
+        print(f" Array shape: {image.shape}") # Add this debug line
     return image
 
 def construct_download_url(pdf_id):
@@ -236,63 +295,95 @@ def process_images(images):
     all_medium_confidence = []
     all_low_confidence = []
 
+    idx = 0
     for img in images:
+        idx += 1
         #print("Type of img before processing:", type(img))
         #print(f" img before processing: {img}")
         processed_images, high_confidence, medium_confidence, low_confidence = analyze_image(img)
-        all_processed_images.append(processed_images)
+        if processed_images is None:
+            print(f" ******* processed_images is None on page: {idx}")
+        else:
+            all_processed_images.append(processed_images)
+            print(f" ******* type of processed_images: {type(processed_images)}")
+
+        if not high_confidence:
+            print(f" ******* high_confidence is empty on page: {idx}")
         all_high_confidence.extend(high_confidence)
+
+        if not medium_confidence:
+            print(f" ******* medium_confidence is empty on page: {idx}")
        all_medium_confidence.extend(medium_confidence)
+
+        if not low_confidence:
+            print(f" ******* low_confidence is empty on page: {idx}")
        all_low_confidence.extend(low_confidence)
 
+    print(f" ******* Size of all_process_images: {len(all_processed_images)}")
+    for item in all_processed_images: print(f"Type Check all_processed: {type(item)}")
+    print(f" ******* Size of all_high_conf: {len(all_high_confidence)}")
+    for item in all_high_confidence: print(f"Type Check high_conf: {type(item)}")
+    print(f" ******* Size of all_med: {len(all_medium_confidence)}")
+    for item in all_medium_confidence: print(f"Type Check med_conf: {type(item)}")
+    print(f" ******* Size of all_low: {len(all_low_confidence)}")
+    for item in all_low_confidence: print(f"Type Check low_conf: {type(item)}")
     return all_processed_images, all_high_confidence, all_medium_confidence, all_low_confidence
-
+
 title = "OIDA Image Collection Interactive demo: Document Layout Analysis with DiT and PubLayNet"
 description = "<h3>OIDA Demo -- adapted liberally from <a href='https://huggingface.co/spaces/nielsr/dit-document-layout-analysis'>https://huggingface.co/spaces/nielsr/dit-document-layout-analysis</a></h3>Demo for Microsoft's DiT, the Document Image Transformer for state-of-the-art document understanding tasks. This particular model is fine-tuned on PubLayNet, a large dataset for document layout analysis (read more at the links below). To use it, simply upload an image or use the example image below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."
 article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2203.02378' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/unilm/tree/master/dit' target='_blank'>Github Repo</a> | <a href='https://huggingface.co/docs/transformers/master/en/model_doc/dit' target='_blank'>HuggingFace doc</a> | <a href='https://ieeexplore.ieee.org/document/8977963' target='_blank'>PubLayNet paper</a></p>"
 #examples =[['fpmj0236_Page_012.png'],['fnmf0234_Page_2.png'],['publaynet_example.jpeg'],['fpmj0236_Page_018.png'],['lrpw0232_Page_14.png'],['kllx0250'],['https://www.industrydocuments.ucsf.edu/opioids/docs/#id=yqgg0230']]
-examples =[{'files': ['fnmf0234_Page_2.png']},{'files': ['fpmj0236_Page_012.png']},{'files': ['lrpw0232.pdf']},{'text': 'https://www.industrydocuments.ucsf.edu/opioids/docs/#id=yqgg0230'},{'files':['fpmj0236_Page_018.png']},{'files':['lrpw0232_Page_14.png']},{'files':['publaynet_example.jpeg']},{'text':'kllx0250'},{'text':'txhk0255'}]
-#txhk0255
+
+examples =[{'files': ['fnmf0234_Page_2.png']},{'files': ['fpmj0236_Page_012.png']},{'files': ['lrpw0232.pdf']},{'text': 'https://www.industrydocuments.ucsf.edu/opioids/docs/#id=yqgg0230'},{'files':['fpmj0236_Page_018.png']},{'files':['lrpw0232_Page_14.png']},{'files':['publaynet_example.jpeg']},{'text':'kllx0250'},{'text':'txhk0255'},{'text':'gpdk0256'}]
+
 css = ".output-image, .input-image, .image-preview {height: 600px !important} td.textbox {display:none;} #component-5 .submit-button {display:none;}"
 
-#iface = gr.Interface(fn=handle_input,
-#                     inputs=gr.MultimodalTextbox(interactive=True,
-#                     label="Upload image/PDF file OR enter OIDA ID or URL",
-#                     file_types=["image",".pdf"],
-#                     placeholder="Upload image/PDF file OR enter OIDA ID or URL"),
-#                     outputs=[gr.Gallery(label="annotated documents"),
-#                     gr.Gallery(label="Figures with High (>85%) Confidence Scores"),
-#                     gr.Gallery(label="Figures with Moderate (50-85%) Confidence Scores"),
-#                     gr.Gallery(label="Figures with Lower Confidence (under 50%) Scores")],
-#                     title=title,
-#                     description=description,
-#                     examples=examples,
-#                     article=article,
-#                     css=css)
-## enable_queue=True)
-with gr.Blocks(css=css) as iface:
+def setup_gradio_interface():
+    #iface = gr.Interface(fn=handle_input,
+    #                     inputs=gr.MultimodalTextbox(interactive=True,
+    #                     label="Upload image/PDF file OR enter OIDA ID or URL",
+    #                     file_types=["image",".pdf"],
+    #                     placeholder="Upload image/PDF file OR enter OIDA ID or URL"),
+    #                     outputs=[gr.Gallery(label="annotated documents"),
+    #                     gr.Gallery(label="Figures with High (>85%) Confidence Scores"),
+    #                     gr.Gallery(label="Figures with Moderate (50-85%) Confidence Scores"),
+    #                     gr.Gallery(label="Figures with Lower Confidence (under 50%) Scores")],
+    #                     title=title,
+    #                     description=description,
+    #                     examples=examples,
+    #                     article=article,
+    #                     css=css)
+    ## enable_queue=True)
+    with gr.Blocks(css=css) as iface:
         gr.Markdown(f"# {title}")
         gr.HTML(description)
-
+
         with gr.Row():
-            with gr.Column():
-                input = gr.MultimodalTextbox(interactive=True,
-                                             label="Upload image/PDF file OR enter OIDA ID or URL",
-                                             file_types=["image",".pdf"],
-                                             placeholder="Upload image/PDF file OR enter OIDA ID or URL",
-                                             submit_btn=None)
-                submit_btn = gr.Button("Submit")
-                gr.HTML('<br /><br /><hr />')
-                gr.Examples(examples, [input])
-
-            with gr.Column():
-                outputs = [gr.Gallery(label="annotated documents"),
-                           gr.Gallery(label="Figures with High (>85%) Confidence Scores"),
-                           gr.Gallery(label="Figures with Moderate (50-85%) Confidence Scores"),
-                           gr.Gallery(label="Figures with Lower Confidence (under 50%) Scores")]
-
+            with gr.Column():
+                input = gr.MultimodalTextbox(interactive=True,
+                                             label="Upload image/PDF file OR enter OIDA ID or URL",
+                                             file_types=["image",".pdf"],
+                                             placeholder="Upload image/PDF file OR enter OIDA ID or URL",
+                                             submit_btn=None)
+                submit_btn = gr.Button("Submit")
+                gr.HTML('<br /><br /><hr />')
+                gr.Examples(examples, [input])
+
+            with gr.Column():
+                outputs = [gr.Gallery(label="annotated documents"),
+                           gr.Gallery(label="Figures with High (>85%) Confidence Scores"),
+                           gr.Gallery(label="Figures with Moderate (50-85%) Confidence Scores"),
+                           gr.Gallery(label="Figures with Lower Confidence (under 50%) Scores")]
+
         with gr.Row():
             gr.HTML(article)
         submit_btn.click(handle_input, [input], outputs)
 
-iface.launch(debug=True, auth=[("oida", "OIDA3.1"), ("Brian", "Hi")]) #, cache_examples=True)
+    return iface
+
+def main():
+    iface = setup_gradio_interface()
+    iface.launch(debug=True, auth=[("oida", "OIDA3.1"), ("Brian", "Hi")]) #, cache_examples=True)
+
+if __name__ == "__main__":
+    main()
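
Beyond the new entry point, the refactor moves figure detection out of analyze_image() into extract_images(), which returns a list of ExtractedImage namedtuples (cropped figure, annotated page, original page, confidence score, corner coordinates, pixel count, color flag) and falls back to a single sentinel entry (image=None, confidence_score=-1) when no figures are found. As a rough illustration of how that structure could be consumed outside the Gradio UI, a hedged sketch; it assumes app.py is importable as a module named app (importing it loads the DiT config and weights) and that one of the demo's example pages is available locally:

import cv2
from app import extract_images, load_image  # assumes app.py is on the import path

page = load_image("fnmf0234_Page_2.png")  # one of the example files referenced in the demo
for result in extract_images(page):
    if result.image is None:
        # Sentinel entry: no figures detected, though the annotated page is still returned
        print("No figures detected on this page")
        continue
    print(f"figure {result.top_left} -> {result.bottom_right}: "
          f"score={result.confidence_score:.1f}%, {result.num_pixels} px, color={result.is_color}")
    # Cropped figures come back as RGB arrays, so convert before writing with OpenCV
    cv2.imwrite(f"figure_{result.top_left}.png", cv2.cvtColor(result.image, cv2.COLOR_RGB2BGR))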