sxandie committed on
Commit
99bf727
1 Parent(s): 50a6157

creating app.py

Files changed (1)
  1. app.py +869 -0
app.py ADDED
@@ -0,0 +1,869 @@
+ import os
+ os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
+ import glob, fitz
+ import PIL
+ import re
+ import torch
+ import cv2
+ import pytesseract
+ import pandas as pd
+ import numpy as np
+ import gradio as gr
+ from PIL import Image
+ from tqdm import tqdm
+ from difflib import SequenceMatcher
+ from itertools import groupby
+ from scipy import ndimage
+ from scipy.ndimage import interpolation as inter
+ from datasets import load_metric
+ from datasets import load_dataset
+ from datasets.features import ClassLabel
+ from transformers import AutoProcessor
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import AutoModelForTokenClassification
+ from transformers.data.data_collator import default_data_collator
+ from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
+ from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor, LayoutLMv3ImageProcessor
+ import io
+ # import paddleocr
+ # from paddleocr import PaddleOCR
+ auth_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+ import warnings
+ # Ignore warning messages
+ warnings.filterwarnings("ignore")
+
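+ # id2label maps the token classifier's class indices to the entity labels used
+ # throughout this app (issuer/receiver/destination fields, document metadata,
+ # and the repeating line-item columns).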
+ id2label = {0: 'others', 1: 'issuer_name', 2: 'issuer_addr', 3: 'issuer_cap', 4: 'issuer_city', 5: 'issuer_prov', 6: 'issuer_state', 7: 'issuer_tel', 8: 'issuer_id', 9: 'issuer_fax', 10: 'issuer_vat', 11: 'issuer_contact', 12: 'issuer_contact_email', 13: 'issuer_contact_phone', 14: 'receiver_name', 15: 'receiver_addr', 16: 'receiver_cap', 17: 'receiver_city', 18: 'receiver_prov', 19: 'receiver_state', 20: 'receiver_tel', 21: 'receiver_fax', 22: 'receiver_vat', 23: 'receiver_id', 24: 'receiver_contact', 25: 'dest_name', 26: 'dest_addr', 27: 'dest_cap', 28: 'dest_city', 29: 'dest_prov', 30: 'dest_state', 31: 'dest_tel', 32: 'dest_fax', 33: 'dest_vat', 34: 'doc_type', 35: 'doc_nr', 36: 'doc_date', 37: 'order_nr', 38: 'order_date', 39: 'service_order', 40: 'shipment_nr', 41: 'client_reference', 42: 'client_vat', 43: 'client_id', 44: 'client_code', 45: 'time', 46: 'notes', 47: 'client_tel', 48: 'art_code', 49: 'ref_code', 50: 'order_reason', 51: 'order_ref', 52: 'order_ref_date', 53: 'detail_desc', 54: 'lot_id', 55: 'lot_qty', 56: 'detail_um', 57: 'detail_qty', 58: 'detail_tare', 59: 'detail_grossw', 60: 'detail_packages', 61: 'detail_netw', 62: 'detail_origin', 63: 'payment_bank', 64: 'payment_terms', 65: 'tot_qty', 66: 'tot_grossw', 67: 'tot_netw', 68: 'tot_volume', 69: 'shipment_reason', 70: 'package_type', 71: 'transport_respons', 72: 'transport_vectors', 73: 'transport_terms', 74: 'transport_datetime', 75: 'return_plt', 76: 'nonreturn_plt', 77: 'dest_signature', 78: 'driver_signature', 79: 'transport_signature', 80: 'page', 81: 'varieta', 82: 'raccolta', 83: 'detail_volume'}
+ custom_config = r'--oem 3 --psm 6'
+ lang = 'eng'
+
+
+ # Google Vision OCR
+ from google.cloud import vision
+ # os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "test-apikey.json"
+
+ processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
+ model = AutoModelForTokenClassification.from_pretrained("sxandie/doc-ai-information-extraction", use_auth_token=auth_token)
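+ # The processor only tokenizes and prepares the page images (apply_ocr=False);
+ # the OCR words and boxes are supplied separately by Tesseract or Google Vision below.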
+
+ from tabulate import tabulate
+ def print_df(df):
+     print(tabulate(df, headers=df.columns, tablefmt='psql'))
+
+
+ def process_image_pytesseract(image, width, height):
+     width, height = image.size
+     feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=True, lang=lang)
+     encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
+     words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
+     return words, boxes
+
+ def create_bounding_box5(vertices, width_scale, height_scale):
+
+     # Get the x, y coordinates
+     x1 = int(vertices[0].x * width_scale)
+     y1 = int(vertices[0].y * height_scale)
+
+     x2 = int(vertices[2].x * width_scale)
+     y2 = int(vertices[2].y * height_scale)
+
+     # Validate x1 < x2
+     if x1 > x2:
+         x1, x2 = x2, x1
+
+     # Validate y1 < y2
+     if y1 > y2:
+         y1, y2 = y2, y1
+
+     # Return valid bounding box
+     return [x1, y1, x2, y2]
+
+ # Google Vision OCR
+ def process_image_GoogleVision(image, width, height):
+     inference_image = [image.convert("RGB")]
+     client = vision.ImageAnnotatorClient()
+     with io.BytesIO() as output:
+         image.save(output, format='JPEG')
+         content = output.getvalue()
+     image = vision.Image(content=content)
+
+     response = client.text_detection(image=image)
+     texts = response.text_annotations
+
+     # Get the bounding box vertices and remove the first item
+     bboxes = [text.bounding_poly.vertices[1:] for text in texts]
+     # Create the list of words and boxes
+     words = [text.description for text in texts]
+     boxes = [create_bounding_box5(bbox, 1000/width, 1000/height) for bbox in bboxes]
+     return words, boxes
+
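+ # Both OCR paths return the recognized words plus bounding boxes scaled to the
+ # 0-1000 coordinate range that LayoutLMv3 expects.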
+
+ def generate_unique_colors(id2label):
+     # Generate unique colors
+     label_ints = np.random.choice(len(PIL.ImageColor.colormap), len(id2label), replace=False)
+     label_color_pil = list(PIL.ImageColor.colormap.values())
+     label_color = [label_color_pil[i] for i in label_ints]
+
+     # map each label name to its own color
+     color = {v: label_color[k] for k, v in id2label.items()}
+
+     return color
+
+ def create_bounding_box1(bbox_data, width_scale: float, height_scale: float):
+     xs = []
+     ys = []
+     for x, y in bbox_data:
+         xs.append(x)
+         ys.append(y)
+
+     left = int(max(0, min(xs) * width_scale))
+     top = int(max(0, min(ys) * height_scale))
+     right = int(min(1000, max(xs) * width_scale))
+     bottom = int(min(1000, max(ys) * height_scale))
+
+     return [left, top, right, bottom]
+
+
+ def unnormalize_box(bbox, width, height):
+     return [
+         width * (bbox[0] / 1000),
+         height * (bbox[1] / 1000),
+         width * (bbox[2] / 1000),
+         height * (bbox[3] / 1000),
+     ]
+
+
+ def iob_to_label(label):
+     return id2label.get(label, 'others')
+
+ def process_image(image):
+     custom_config = r'--oem 3 --psm 6'
+     # lang='eng+deu+ita+chi_sim'
+     lang = 'eng'
+     width, height = image.size
+     feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True)
+     encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
+     words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
+
+     # encode
+     inference_image = [image.convert("RGB")]
+     encoding = processor(inference_image, truncation=True, return_offsets_mapping=True, return_tensors="pt",
+                          padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
+     offset_mapping = encoding.pop('offset_mapping')
+     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+
+     # change the shape of pixel values
+     x = []
+     for i in range(0, len(encoding['pixel_values'])):
+         x.append(encoding['pixel_values'][i])
+     x = torch.stack(x)
+     encoding['pixel_values'] = x
+
+     # forward pass
+     outputs = model(**encoding)
+
+     # get predictions
+     predictions = outputs.logits.argmax(-1).squeeze().tolist()
+     token_boxes = encoding.bbox.squeeze().tolist()
+
+     # only keep non-subword predictions
+     preds = []
+     l_words = []
+     bboxes = []
+     token_section_num = []
+
+     if (len(token_boxes) == 512):
+         predictions = [predictions]
+         token_boxes = [token_boxes]
+
+     for i in range(0, len(token_boxes)):
+         for j in range(0, len(token_boxes[i])):
+             unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
+             if (np.asarray(token_boxes[i][j]).shape != (4,)):
+                 continue
+             elif (token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0):
+                 # print('zero found!')
+                 continue
+             # if the bbox is not in the list yet, store the word, box and prediction
+             elif (unnormal_box not in bboxes):
+                 preds.append(predictions[i][j])
+                 l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
+                 bboxes.append(unnormal_box)
+                 token_section_num.append(i)
+             else:
+                 # the bbox is already known, so we have to update the word
+                 _index = bboxes.index(unnormal_box)
+                 if (token_section_num[_index] == i):
+                     # check whether they are in the same section: documents with more than 512 tokens are split into
+                     # separate parts, so a word can appear in more than one part and those repetitive words must be
+                     # controlled. Here they are in the same section, so the sub-tokens can be merged safely.
+                     l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
+                 else:
+                     continue
+
+     return bboxes, preds, l_words, image
+
+
+
+
+ def process_image_encoding(model, processor, image, words, boxes, width, height):
+     # encode
+     inference_image = [image.convert("RGB")]
+     encoding = processor(inference_image, words, boxes=boxes, truncation=True, return_offsets_mapping=True, return_tensors="pt",
+                          padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
+     offset_mapping = encoding.pop('offset_mapping')
+     overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+
+     # change the shape of pixel values
+     x = []
+     for i in range(0, len(encoding['pixel_values'])):
+         x.append(encoding['pixel_values'][i])
+     x = torch.stack(x)
+     encoding['pixel_values'] = x
+
+     # forward pass
+     outputs = model(**encoding)
+
+     # get predictions
+     predictions = outputs.logits.argmax(-1).squeeze().tolist()
+     token_boxes = encoding.bbox.squeeze().tolist()
+
+     # only keep non-subword predictions
+     preds = []
+     l_words = []
+     bboxes = []
+     token_section_num = []
+
+     if (len(token_boxes) == 512):
+         predictions = [predictions]
+         token_boxes = [token_boxes]
+
+     for i in range(0, len(token_boxes)):
+         for j in range(0, len(token_boxes[i])):
+             unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
+             if (np.asarray(token_boxes[i][j]).shape != (4,)):
+                 continue
+             elif (token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0):
+                 # print('zero found!')
+                 continue
+             # if the bbox is not in the list yet, store the word, box and prediction
+             elif (unnormal_box not in bboxes):
+                 preds.append(predictions[i][j])
+                 l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
+                 bboxes.append(unnormal_box)
+                 token_section_num.append(i)
+             else:
+                 # the bbox is already known, so we have to update the word
+                 _index = bboxes.index(unnormal_box)
+                 if (token_section_num[_index] == i):
+                     # check whether they are in the same section: documents with more than 512 tokens are split into
+                     # separate parts, so a word can appear in more than one part and those repetitive words must be
+                     # controlled. Here they are in the same section, so the sub-tokens can be merged safely.
+                     l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
+                 else:
+                     continue
+
+     return bboxes, preds, l_words, image
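+ # Pages longer than 512 tokens are split into overlapping windows (stride 128);
+ # the loop above de-duplicates predictions by bounding box so every word is kept
+ # once and sub-word tokens of the same word are merged back together.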
+
+
+ def process_form_(json_df):
+
+     labels = [x['LABEL'] for x in json_df]
+     texts = [x['TEXT'] for x in json_df]
+     cmb_list = []
+     for i, j in enumerate(labels):
+         cmb_list.append([labels[i], texts[i]])
+
+     grouper = lambda l: [[k] + sum((v[1::] for v in vs), []) for k, vs in groupby(l, lambda x: x[0])]
+
+     list_final = grouper(cmb_list)
+     lst_final = []
+     for x in list_final:
+         json_dict = {}
+         json_dict[x[0]] = (' ').join(x[1:])
+         lst_final.append(json_dict)
+
+     return lst_final
+
+
+ def createExcel(maindf, detailsdf, pdffile):
+     outputPath = f'{pdffile}.xlsx'
+     with pd.ExcelWriter(outputPath, engine='xlsxwriter') as writer:
+         maindf.to_excel(writer, sheet_name='headers', index=False)
+         detailsdf.to_excel(writer, sheet_name='details', index=False)
+         worksheet1 = writer.sheets["headers"]
+         for idx, col in enumerate(maindf):
+             series = maindf[col]
+             max_len = max((
+                 series.astype(str).map(len).max(),
+                 len(str(series.name))
+             )) + 1
+             worksheet1.set_column(idx, idx, max_len)
+         worksheet2 = writer.sheets["details"]
+         for idx, col in enumerate(detailsdf):
+             series = detailsdf[col]
+             max_len = max((
+                 series.astype(str).map(len).max(),
+                 len(str(series.name))
+             )) + 1
+             worksheet2.set_column(idx, idx, max_len)
+     return outputPath
+
+
+ def visualize_image(final_bbox, final_preds, l_words, image, label2color):
+
+     draw = ImageDraw.Draw(image)
+     font = ImageFont.load_default()
+     json_df = []
+
+     for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
+         if prediction is not None:
+             predicted_label = iob_to_label(prediction).lower()
+             if predicted_label not in ["others"]:
+                 draw.rectangle(box, outline=label2color[predicted_label])
+                 draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
+                 json_dict = {}
+                 json_dict['TEXT'] = l_words[ix]
+                 json_dict['LABEL'] = label2color[predicted_label]
+                 json_df.append(json_dict)
+     return image, json_df
+
+ def rotate_image(image):
+     extracted_text = pytesseract.image_to_string(image)
+     # check if the image contains any text
+     if not extracted_text:
+         print("The image does not contain any text.")
+         return None
+     elif extracted_text.isspace():
+         print("The image contains only spaces.")
+         return None
+     text = pytesseract.image_to_osd(image)
+     angle = int(re.search(r'(?<=Rotate: )\d+', text).group(0))
+     angle = 360 - angle
+     rotated = ndimage.rotate(image, angle)
+     data = Image.fromarray(rotated)
+     return data
+
+
+ # correct the skewness of images
+ def correct_skew(image, delta=1, limit=5):
+     def determine_score(arr, angle):
+         data = inter.rotate(arr, angle, reshape=False, order=0)
+         histogram = np.sum(data, axis=1, dtype=float)
+         score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
+         return histogram, score
+
+     # Convert the PIL Image object to a numpy array
+     image = np.asarray(image.convert('L'), dtype=np.uint8)
+
+     # Apply thresholding
+     thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+     scores = []
+     angles = np.arange(-limit, limit + delta, delta)
+     for angle in angles:
+         histogram, score = determine_score(thresh, angle)
+         scores.append(score)
+     best_angle = angles[scores.index(max(scores))]
+
+     (h, w) = image.shape[:2]
+     center = (w // 2, h // 2)
+     M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
+     corrected = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
+                                borderMode=cv2.BORDER_REPLICATE)
+     return best_angle, corrected
+
+
+ def removeBorders(img):
+     result = img.copy()
+
+     if len(result.shape) == 2:
+         # if the input image is grayscale, convert it to BGR format
+         result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR)
+
+     gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)  # convert to grayscale
+     thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+     # Remove horizontal lines
+     horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40,1))
+     remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
+     cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+     cnts = cnts[0] if len(cnts) == 2 else cnts[1]
+     for c in cnts:
+         cv2.drawContours(result, [c], -1, (255,255,255), 5)
+
+     # Remove vertical lines
+     vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,40))
+     remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
+     cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+     cnts = cnts[0] if len(cnts) == 2 else cnts[1]
+     for c in cnts:
+         cv2.drawContours(result, [c], -1, (255,255,255), 5)
+
+     return result
+
+ def color2label_except(label2color, excluded_labels):
+     """
+     Inversely maps colors to labels based on the provided label2color dictionary,
+     excluding the specified labels.
+
+     Args:
+         label2color (dict): Dictionary mapping labels to colors.
+         excluded_labels (list): List of labels to exclude.
+
+     Returns:
+         dict: Dictionary mapping colors to labels, excluding the specified labels.
+     """
+     # Filter out excluded labels from label2color dictionary
+     filtered_label2color = {label: color for label, color in label2color.items() if label not in excluded_labels}
+
+     # Invert the filtered label2color dictionary to create color2label mapping
+     return {v: k for k, v in filtered_label2color.items()}
+
+
+ def add_dataframe(df_main, labels_repeating, label2color):
+     col_name_map = color2label_except(label2color, labels_repeating)
+
+     columns = list(col_name_map.values())
+     data = {col: [] for col in columns}
+     for i in df_main:
+         for k, v in i.items():
+             if k in col_name_map:
+                 data[col_name_map[k]].append(v)
+
+     # join the list of strings for each column and convert to a dataframe
+     for col in columns:
+         data[col] = [' '.join(data[col])]
+     df_upper = pd.DataFrame(data)
+     key_value_pairs = []
+     for col in df_upper.columns:
+         key_value_pairs.append({'key': col, 'value': df_upper[col][0]})
+     df_key_value = pd.DataFrame(key_value_pairs)
+     # Extract the value from the containertype column
+     # container_quantity = int(df_key_value[df_key_value['key'] == 'containertype']['value'].str.split("x").str[0])
+
+     # # Add a new row to the DataFrame
+     # df_key_value = df_key_value.append({'key': 'containerquantity', 'value': container_quantity}, ignore_index=True)
+
+     # # Extract the desired value from the containertype column
+     # df_key_value.loc[df_key_value['key'] == 'containertype', 'value'] = df_key_value.loc[df_key_value['key'] == 'containertype', 'value'].str.split("x").str[1]
+     return df_key_value
+
+
+ import statistics
+
+ def id2label_row(s, id2label):
+     if s in id2label.values():
+         return s
+     return id2label[s]
+
+ def dist_height(y1, y2):
+     return abs(int(y1) - int(y2))
+
+
+ def mergeBoxes(df):
+     xmin, ymin, xmax, ymax = [], [], [], []
+     for i in range(df.shape[0]):
+         box = df['bbox_column'].iloc[i]
+         xmin.append(box[0])
+         ymin.append(box[1])
+         xmax.append(box[2])
+         ymax.append(box[3])
+     return [min(xmin), min(ymin), max(xmax), max(ymax)]
+
+
+ def transform_dataset(df, merge_labels):
+     df_temp = df.iloc[merge_labels]  # a duplicate df with only the concerned rows
+     df_temp.reset_index(drop=True, inplace=True)
+     text = ' '.join(df_temp['scr_column'])
+     bbox = mergeBoxes(df_temp)
+     retain_index = merge_labels[0]  # the first index is the parent row
+     df['scr_column'].iloc[retain_index] = text
+     df['bbox_column'].iloc[retain_index] = bbox
+     # keeping the first row & removing the rest
+     df = df.loc[~df.index.isin(merge_labels[1:]), :]
+     df.reset_index(drop=True, inplace=True)
+     return df
+
+
+ def box_overlap(box1, box2, horizontal_vertical):
+     # Extract coordinates of box1
+     x1_box1, y1_box1, x2_box1, y2_box1 = box1
+     # Extract coordinates of box2
+     x1_box2, y1_box2, x2_box2, y2_box2 = box2
+
+     # Check if the boxes overlap horizontally or vertically
+     if horizontal_vertical == "H":
+         if x1_box1 <= x2_box2 and x2_box1 >= x1_box2:
+             return True
+         else:
+             return False
+     if horizontal_vertical == "V":
+         if y1_box1 <= y2_box2 and y2_box1 >= y1_box2:
+             return True
+         else:
+             return False
+
+
+
+ def horizontal_merging(df, font_length, perform_overlapping=False, x_change=0, y_change=0):
+     fat_df = df.copy()
+     for i in range(df.shape[0]):
+         box = fat_df['bbox_column'].iloc[i]
+         fat_df['bbox_column'].iloc[i] = [box[0]-x_change, box[1]-y_change, box[2]+x_change, box[3]+y_change]
+     redundant_rows = []
+     if perform_overlapping == True:
+         for i in range(fat_df.shape[0]):
+             box_i = fat_df.bbox_column[i]
+             indices2merge = []
+
+             for j in range(i+1, fat_df.shape[0]):
+                 if fat_df.preds_column[j] == fat_df.preds_column[i]:  # if the labels are the same
+                     box_j = fat_df.bbox_column[j]
+                     if abs(box_i[1]-box_j[3]) < font_length*1.5:  # if the boxes are within 50% more than the font size in height
+                         # Check if the boxes overlap horizontally
+                         if box_overlap(box_i, box_j, 'H'):
+                             indices2merge.append(j)
+                             df.scr_column[i] += df.scr_column[j]
+                             box_i = fat_df.bbox_column[j]  # finding the next connected word
+
+             # once we have all indices that belong to a particular category,
+             # merge the bounding boxes, keeping them in the first row.
+             if len(indices2merge) != 0:
+                 df['bbox_column'].iloc[i] = mergeBoxes(df.loc[indices2merge])
+                 redundant_rows.extend(indices2merge)
+
+     # now that all the transformation is done, remove the redundant rows
+     return df.drop(redundant_rows)
+
+
+ def mergeLabelsExtensive_repeating(df_grouped, repeating_label):
+     # this function merges same-label entities together into a single instance.
+     df_grouped.reset_index(inplace=True, drop=True)
+     df_grouped = df_grouped[df_grouped['preds_column'].isin(repeating_label)]
+     # estimate the font size from the height of the first few boxes
+     font_length = 0
+     count = 0
+     while count < 5 and count < df_grouped.shape[0]:
+         box_i = df_grouped['bbox_column'].iloc[count]  # box of the current label contains [x1,y1,x3,y3]
+         font_length += box_i[3] - box_i[1]
+         count += 1
+     font_length = font_length / max(count, 1)
+
+     df_grouped = horizontal_merging(df_grouped, font_length, True, 30, 0)
+     return df_grouped
+
+
+
+ def group_labels_wrt_height(df):
+     """
+     This function groups the labels based on the height of the bounding box.
+     """
+     # sorting the lines based on heights using the column 'y_axis'
+     df = df.sort_values(by='y_axis')
+     df.reset_index(inplace=True, drop=True)
+     print("entering: group_labels_wrt_height ")
+
+     final_yaxis = []
+     final_scr = []
+     final_pred = []
+
+     current_group = []
+     current_scr = []
+     current_pred = []
+
+     # Iterate through the column values
+     for i, (value, scr, preds) in enumerate(zip(df['y_axis'], df['scr_column'], df['preds_column'])):
+         if i == 0:
+             # Start a new group with the first value
+             current_group.append(value)
+             current_scr.append(scr)
+             current_pred.append(preds)
+         else:
+             # Check if the difference between the current value and the previous value is <= 35
+             if abs(value - df['y_axis'][i - 1]) <= 35:
+                 # Add the value to the current group
+                 current_group.append(value)
+                 current_scr.append(scr)
+                 current_pred.append(preds)
+             else:
+                 # Start a new group with the current value
+                 final_yaxis.append(current_group)
+                 final_scr.append(current_scr)
+                 final_pred.append(current_pred)
+
+                 current_group = [value]
+                 current_scr = [scr]
+                 current_pred = [preds]
+
+     # Add the last group
+     final_yaxis.append(current_group)
+     final_scr.append(current_scr)
+     final_pred.append(current_pred)
+
+     final_grouped_df = pd.DataFrame({'y_axis': final_yaxis, 'scr_column': final_scr, 'preds_column': final_pred})
+
+     print("Grouped df after sorting based on height")
+     print_df(final_grouped_df)
+
+     return final_grouped_df
+
+
+
+ # searches the set of labels in the whole range
+ def search_labelSet_height_range(df, d, keyList):
+     print("search_labelSet_height_range")
+     keyDict = dict.fromkeys(keyList, [])  # stores the required information as a dictionary, then converted to a df
+     print("Dataframe from which extraction is going to happen: ")
+
+     for i in range(df.shape[0]):  # search df for the y-axis value and check if it lies within the range d.
+         box = df['bbox_column'].iloc[i]
+         if dist_height(box[1], d) < 50:
+             key = df['preds_column'].iloc[i]
+             keyDict[key] = df['scr_column'].iloc[i]
+     return keyDict
+
+
+ def clean_colText(df, column):
+     for i in range(df.shape[0]):
+         df[column].iloc[i] = df[column].iloc[i].replace('[', '').replace('|', '').replace('+', '')
+     return df
+
+
+ def find_repeatingLabels(df, labels_repeating):
+     print("In find_repeatingLabels: ")
+     row2drop = []  # dropping the rows that have been covered in the previous dataframe
+     for i in range(df.shape[0]):
+         df['preds_column'].iloc[i] = id2label_row(df['preds_column'].iloc[i], id2label)
+         if df['preds_column'].iloc[i] not in labels_repeating:
+             row2drop.append(i)
+     df.drop(index=row2drop, inplace=True)
+     df = clean_colText(df, 'scr_column')
+
+     print("removing non-tabular labels.")
+
+     df = mergeLabelsExtensive_repeating(df, labels_repeating)
+     print('after merging non-tabular labels: ')
+
+     labels_repeating = list(set(list(df["preds_column"])))
+     print("labels_repeating in this document are: ", labels_repeating)
+     # adding an extra column that contains the Y-axis information (height)
+     df['y_axis'] = np.NaN
+     for i in range(df.shape[0]):
+         box = df['bbox_column'].iloc[i]
+         df['y_axis'].iloc[i] = box[1]
+
+     print("After adding y-axis data in the dataframes: ")
+     df = mergeLabelsExtensive(df)
+     print("after merging the df extensively")
+     print("Grouping the labels wrt heights: ")
+     grouped_df = group_labels_wrt_height(df)
+
+     # once the labels are grouped, create dictionaries for the labels and values occurring in a single line
+     row_dicts = []  # will contain each row of the df as a single dictionary.
+     for _, row in grouped_df.iterrows():
+         row_dict = {}
+         for preds, scr in zip(row['preds_column'], row['scr_column']):
+             row_dict[preds] = scr
+         row_dicts.append(row_dict)
+
+     # creating the new details dataframe, one row per grouped line
+     final_df = pd.DataFrame(columns=labels_repeating)
+     for d in row_dicts:
+         final_df = final_df.append(d, ignore_index=True)
+     final_df = final_df.fillna('')
+     return final_df
+
+
+ def mergeImageVertical(images):
+     # pick the smallest image and resize the others to match it (the target shape can be arbitrary here)
+     min_shape = sorted([(np.sum(i.size), i.size) for i in images])[0][1]
+     # for vertical stacking it is simple: use vstack
+     imgs_comb = np.vstack([i.resize(min_shape) for i in images])
+     imgs_comb = Image.fromarray(imgs_comb)
+     return imgs_comb
+
+ def perform_erosion(img):
+     # Check if the image is already in grayscale
+     if len(img.shape) == 2:
+         gray = img
+     else:
+         # Convert the image to grayscale
+         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+
+     # Define the kernel for erosion and dilation
+     kernel = np.ones((3, 3), np.uint8)
+
+     # Perform erosion followed by dilation
+     erosion = cv2.erode(gray, kernel, iterations=1)
+     dilation = cv2.dilate(erosion, kernel, iterations=1)
+
+     # Double the size of the image
+     double_size = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
+
+     # Perform erosion on the doubled image
+     double_erosion = cv2.erode(double_size, kernel, iterations=1)
+
+     return double_erosion
+
+
+
+ def remove_leading_trailing_special_characters(input_string):
+     cleaned_string = re.sub(r'^[^A-Za-z0-9]+|[^A-Za-z0-9]+$', '', str(input_string))
+     return cleaned_string
+
+
+ def clean_dataframe(df):
+     # Apply the remove_leading_trailing_special_characters function to all string columns
+     for column in df.select_dtypes(include='object').columns:
+         df[column] = df[column].apply(remove_leading_trailing_special_characters)
+
+     # Replace NaN values with blanks
+     df = df.fillna('')
+     return df
+ def mergeLabelsExtensive(df_grouped):
+     i = 0
+     while i < df_grouped.shape[0]:
+         merge_labels = [i]  # collects indices whose data has been merged, so we need to delete it now.
+         label = df_grouped['preds_column'].iloc[i]
+         box1 = df_grouped['bbox_column'].iloc[i]
+
+         for j in range(i+1, df_grouped.shape[0]):
+             box2 = df_grouped['bbox_column'].iloc[j]
+             if label == df_grouped['preds_column'].iloc[j] and dist_height(box1[3], box2[3]) < 20:  # which are in the vicinity of 20 pixels.
+                 merge_labels.append(j)
+         print_df(df_grouped)
+         df_grouped = transform_dataset(df_grouped, merge_labels)
+         i = i + 1
+     return df_grouped
+
+ def multilabelsHandle(df, thermo_details):
+     # Since 0 is assigned to 'others' and these values are not so important, we delete them.
+     df = df[df.preds_column != 0]
+     df.reset_index(drop=True, inplace=True)
+     for i in range(df.shape[0]):
+         df['preds_column'].iloc[i] = id2label.get(df['preds_column'].iloc[i])
+     df['preds_column'].unique()
+     df_grouped = df.copy()  # stores the index of relevant labels.
+     df_grouped.shape[0]
+     for i in range(df.shape[0]):
+         if df['preds_column'].iloc[i] not in thermo_details:
+             df_grouped.drop(i, inplace=True)
+     df_grouped.reset_index(drop=True, inplace=True)
+
+     keyList = df_grouped['preds_column'].unique()
+     df_grouped = mergeLabelsExtensive(df_grouped)
+
+     # extract the height of boxes
+     df_grouped = extract_yaxis(df_grouped)
+     shipment_labels = ['delivery_name','delivery_address','contact_phone']
+     # shipment
+     heights_shipment = get_heights(df_grouped, shipment_labels)
+
+     # now segregating the other repeating values in the df, like measure, weight, volume etc.
+     # they will be contained within the heights, as the heights act as boundaries.
+     df_labelSet = pd.DataFrame(columns=thermo_details)
+     for i in range(len(heights_shipment)):
+         if i == len(heights_shipment)-1:
+             new_df = search_labelSet_between_h1_h2(df_grouped, heights_shipment[i], 5000, keyList)
+         else:
+             new_df = search_labelSet_between_h1_h2(df_grouped, heights_shipment[i], heights_shipment[i+1], keyList)
+         df_labelSet = df_labelSet.append(new_df, ignore_index=True)
+     return df_labelSet
+
+
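+ # completepreprocess is the end-to-end pipeline run by the Gradio app: render each PDF
+ # page, fix rotation and skew, strip table borders, OCR the page, run the LayoutLMv3
+ # token classifier, then collect one-off header fields and repeating line-item rows.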
+ def completepreprocess(pdffile, ocr_type):
+     myDataFrame = pd.DataFrame()
+     myDataFrame2 = pd.DataFrame()
+     merge_pages = []
+     doc = fitz.open(pdffile)
+     for i in range(0, len(doc)):
+         page = doc.load_page(i)
+         zoom = 2
+         mat = fitz.Matrix(zoom, zoom)
+         pix = page.get_pixmap(matrix=mat, dpi=300)
+         image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+         ro_image = rotate_image(image)
+         if ro_image is None:
+             return None
+         angle, skewed_image = correct_skew(ro_image)
+         if skewed_image is None:
+             return None
+         remove_border = removeBorders(skewed_image)
+         image = Image.fromarray(remove_border)
+         label2color = generate_unique_colors(id2label)
+         width, height = image.size
+         if ocr_type == "GoogleVisionOCR":
+             words, boxes = process_image_GoogleVision(image, width, height)
+         else:
+             words, boxes = process_image_pytesseract(image, width, height)
+
+         bbox, preds, words, image = process_image_encoding(model, processor, image, words, boxes, width, height)
+         im, df_visualize = visualize_image(bbox, preds, words, image, label2color)
+         df_main = process_form_(df_visualize)
+
+         bbox_column = bbox
+         preds_column = preds
+         scr_column = words
+
+         # dictionary of lists
+         dict = {'bbox_column': bbox_column, 'preds_column': preds_column, 'scr_column': scr_column}
+         df_single_page = pd.DataFrame(dict)
+         labels_repeating = ['art_code', 'ref_code', 'detail_desc','lot_id','detail_qty','detail_um','detail_tare','detail_grossw','detail_netw','detail_origin','varieta','raccolta']
+         df_repeating_page = find_repeatingLabels(df_single_page, labels_repeating)
+         myDataFrame2 = myDataFrame2.append(df_repeating_page, sort=False)
+
+         df1 = add_dataframe(df_main, labels_repeating, label2color).astype(str)
+         myDataFrame = myDataFrame.append(df1, sort=False).reset_index(drop=True)
+         row2drop = []
+         for i in range(myDataFrame.shape[0]):
+             if len(myDataFrame['value'].iloc[i]) == 0:
+                 row2drop.append(i)
+         myDataFrame.drop(index=row2drop, inplace=True)
+         myDataFrame.reset_index(drop=True, inplace=True)
+         myDataFrame = myDataFrame[myDataFrame["value"].notnull()]
+         myDataFrame.drop_duplicates(subset=["key"], inplace=True)
+         myDataFrame2 = myDataFrame2.loc[:, ~(myDataFrame2.apply(lambda x: all(isinstance(val, list) and len(val) == 0 for val in x)))]
+         merge_pages.append(im)
+     im2 = mergeImageVertical(merge_pages)
+     myDataFrame2 = clean_dataframe(myDataFrame2)
+     myDataFrame = clean_dataframe(myDataFrame)
+     myDataFrame = myDataFrame[myDataFrame['key'] != 'others']
+     output_excel_path = createExcel(myDataFrame, myDataFrame2, pdffile.name)
+     return im2, myDataFrame, myDataFrame2, output_excel_path
+
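+ # Gradio UI: upload a PDF and pick an OCR engine; the app returns the annotated page
+ # image(s), the extracted header fields, the repeating line items, and an Excel export.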
+
+ title = "Interactive demo: Transport Document Information Extraction from PDFs/Images"
+ description = "Results will show up in a few seconds. The model was trained on only 1326 images, with 226 images used for testing. The annotated image can be opened in a new window for a better view."
+
+ css = """.output_image, .input_image {height: 600px !important}"""
+ # examples = []
+
+ iface = gr.Interface(
+     fn=completepreprocess,
+     inputs=[
+         gr.components.File(label="PDF"),
+         gr.components.Dropdown(label="Select the OCR", choices=["Pytesseract", "GoogleVisionOCR"]),
+     ],
+     outputs=[
+         gr.components.Image(type="pil", label="annotated image"),
+         "dataframe",
+         "dataframe",
+         gr.File(label="Excel output")
+     ],
+     title=title,
+     description=description,
+     css=css
+ )
+
+ iface.launch(inline=True, debug=True)