creating app.py
app.py
ADDED
@@ -0,0 +1,869 @@
import os
# Install CPU-only PyTorch at startup (avoids pulling CUDA wheels on the Space)
os.system('pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu')
import glob, fitz
import PIL
import re
import torch
import cv2
import pytesseract
import pandas as pd
import numpy as np
import gradio as gr
from PIL import Image
from tqdm import tqdm
from difflib import SequenceMatcher
from itertools import groupby
from scipy import ndimage
from scipy.ndimage import interpolation as inter
from datasets import load_metric
from datasets import load_dataset
from datasets.features import ClassLabel
from transformers import AutoProcessor
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModelForTokenClassification
from transformers.data.data_collator import default_data_collator
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor, LayoutLMv3ImageProcessor
import io
# import paddleocr
# from paddleocr import PaddleOCR
auth_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
import warnings
# Ignore warning messages
warnings.filterwarnings("ignore")
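# Label schema: 84 token-classification classes covering issuer/receiver/destination
# metadata, document and order references, line-item ("detail_*") fields, totals,
# transport data and signatures; index 0 ('others') is the catch-all background class.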

id2label = {0: 'others', 1: 'issuer_name', 2: 'issuer_addr', 3: 'issuer_cap', 4: 'issuer_city', 5: 'issuer_prov', 6: 'issuer_state', 7: 'issuer_tel', 8: 'issuer_id', 9: 'issuer_fax', 10: 'issuer_vat', 11: 'issuer_contact', 12: 'issuer_contact_email', 13: 'issuer_contact_phone', 14: 'receiver_name', 15: 'receiver_addr', 16: 'receiver_cap', 17: 'receiver_city', 18: 'receiver_prov', 19: 'receiver_state', 20: 'receiver_tel', 21: 'receiver_fax', 22: 'receiver_vat', 23: 'receiver_id', 24: 'receiver_contact', 25: 'dest_name', 26: 'dest_addr', 27: 'dest_cap', 28: 'dest_city', 29: 'dest_prov', 30: 'dest_state', 31: 'dest_tel', 32: 'dest_fax', 33: 'dest_vat', 34: 'doc_type', 35: 'doc_nr', 36: 'doc_date', 37: 'order_nr', 38: 'order_date', 39: 'service_order', 40: 'shipment_nr', 41: 'client_reference', 42: 'client_vat', 43: 'client_id', 44: 'client_code', 45: 'time', 46: 'notes', 47: 'client_tel', 48: 'art_code', 49: 'ref_code', 50: 'order_reason', 51: 'order_ref', 52: 'order_ref_date', 53: 'detail_desc', 54: 'lot_id', 55: 'lot_qty', 56: 'detail_um', 57: 'detail_qty', 58: 'detail_tare', 59: 'detail_grossw', 60: 'detail_packages', 61: 'detail_netw', 62: 'detail_origin', 63: 'payment_bank', 64: 'payment_terms', 65: 'tot_qty', 66: 'tot_grossw', 67: 'tot_netw', 68: 'tot_volume', 69: 'shipment_reason', 70: 'package_type', 71: 'transport_respons', 72: 'transport_vectors', 73: 'transport_terms', 74: 'transport_datetime', 75: 'return_plt', 76: 'nonreturn_plt', 77: 'dest_signature', 78: 'driver_signature', 79: 'transport_signature', 80: 'page', 81: 'varieta', 82: 'raccolta', 83: 'detail_volume'}
# Tesseract config: LSTM engine (--oem 3), assume a single uniform block of text (--psm 6)
custom_config = r'--oem 3 --psm 6'
lang = 'eng'


# Google Vision OCR
from google.cloud import vision
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "test-apikey.json"

# The processor tokenizes externally supplied words/boxes (apply_ocr=False);
# OCR itself is done by Tesseract or Google Vision below.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
model = AutoModelForTokenClassification.from_pretrained("sxandie/doc-ai-information-extraction", use_auth_token=auth_token)

from tabulate import tabulate

def print_df(df):
    print(tabulate(df, headers=df.columns, tablefmt='psql'))

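# OCR path 1: process_image_pytesseract uses LayoutLMv3ImageProcessor with
# apply_ocr=True, which runs Tesseract internally and returns the recognized words
# together with boxes already normalized to the 0-1000 space LayoutLMv3 expects.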
def process_image_pytesseract(image, width, height):
    width, height = image.size  # width/height arguments are recomputed from the image itself
    feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=True, lang=lang)
    encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes
    return words, boxes

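# Google Vision returns each word as a 4-vertex polygon. After dropping the first
# vertex (see process_image_GoogleVision below), indices 0 and 2 are opposite
# corners, so the pair below spans the box; the swaps guard against rotated
# vertex order.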
def create_bounding_box5(vertices, width_scale, height_scale):
    # Get the x, y coordinates of two opposite corners
    x1 = int(vertices[0].x * width_scale)
    y1 = int(vertices[0].y * height_scale)

    x2 = int(vertices[2].x * width_scale)
    y2 = int(vertices[2].y * height_scale)

    # Ensure x1 < x2
    if x1 > x2:
        x1, x2 = x2, x1

    # Ensure y1 < y2
    if y1 > y2:
        y1, y2 = y2, y1

    # Return a valid bounding box
    return [x1, y1, x2, y2]

# Google Vision OCR
def process_image_GoogleVision(image, width, height):
    inference_image = [image.convert("RGB")]
    client = vision.ImageAnnotatorClient()
    with io.BytesIO() as output:
        image.save(output, format='JPEG')
        content = output.getvalue()
    image = vision.Image(content=content)

    response = client.text_detection(image=image)
    texts = response.text_annotations

    # Get the bounding box vertices and drop the first vertex of each polygon
    bboxes = [text.bounding_poly.vertices[1:] for text in texts]
    # Create the lists of words and boxes, scaled to the 0-1000 LayoutLM space
    words = [text.description for text in texts]
    boxes = [create_bounding_box5(bbox, 1000/width, 1000/height) for bbox in bboxes]
    return words, boxes


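# Note: Google Vision's text_annotations[0] (used in process_image_GoogleVision
# above) is the full-page text block rather than a single word; it is kept here
# and flows through the same pipeline as the word-level boxes.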
def generate_unique_colors(id2label):
    # Sample one distinct named PIL color per label
    label_ints = np.random.choice(len(PIL.ImageColor.colormap), len(id2label), replace=False)
    label_color_pil = list(PIL.ImageColor.colormap.values())
    label_color = [label_color_pil[i] for i in label_ints]

    color = {}
    for k, v in id2label.items():
        color[v] = label_color[k]

    return color

def create_bounding_box1(bbox_data, width_scale: float, height_scale: float):
    xs = []
    ys = []
    for x, y in bbox_data:
        xs.append(x)
        ys.append(y)

    left = int(max(0, min(xs) * width_scale))
    top = int(max(0, min(ys) * height_scale))
    right = int(min(1000, max(xs) * width_scale))
    bottom = int(min(1000, max(ys) * height_scale))

    return [left, top, right, bottom]


def unnormalize_box(bbox, width, height):
    # Map a 0-1000 normalized box back to pixel coordinates
    return [
        width * (bbox[0] / 1000),
        height * (bbox[1] / 1000),
        width * (bbox[2] / 1000),
        height * (bbox[3] / 1000),
    ]


def iob_to_label(label):
    return id2label.get(label, 'others')

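# process_image appears to be an earlier all-in-one variant (OCR via
# LayoutLMv3FeatureExtractor plus inference in one call); it is not called by
# completepreprocess below, which uses process_image_encoding instead.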
def process_image(image):
    custom_config = r'--oem 3 --psm 6'
    # lang='eng+deu+ita+chi_sim'
    lang = 'eng'
    width, height = image.size
    feature_extractor = LayoutLMv3FeatureExtractor(apply_ocr=True)
    encoding_feature_extractor = feature_extractor(image, return_tensors="pt", truncation=True)
    words, boxes = encoding_feature_extractor.words, encoding_feature_extractor.boxes

    # encode; pass the precomputed words/boxes since the shared processor was
    # created with apply_ocr=False
    inference_image = [image.convert("RGB")]
    encoding = processor(inference_image, words, boxes=boxes, truncation=True, return_offsets_mapping=True, return_tensors="pt",
                         padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

    # change the shape of pixel values
    x = []
    for i in range(0, len(encoding['pixel_values'])):
        x.append(encoding['pixel_values'][i])
    x = torch.stack(x)
    encoding['pixel_values'] = x

    # forward pass
    outputs = model(**encoding)

    # get predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    # only keep non-subword predictions
    preds = []
    l_words = []
    bboxes = []
    token_section_num = []

    if len(token_boxes) == 512:
        # a single 512-token window: wrap in a list so the loop below is uniform
        predictions = [predictions]
        token_boxes = [token_boxes]

    for i in range(0, len(token_boxes)):
        for j in range(0, len(token_boxes[i])):
            unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
            if np.asarray(token_boxes[i][j]).shape != (4,):
                continue
            elif token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0:
                # padding token
                continue
            # if the bbox is not in the list yet, add a new word
            elif unnormal_box not in bboxes:
                preds.append(predictions[i][j])
                l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
                bboxes.append(unnormal_box)
                token_section_num.append(i)
            else:
                # the bbox already exists, so we have to update the word.
                # Documents with more than 512 tokens are split into overlapping
                # windows, so the same word can appear in two windows; only merge
                # subwords that come from the same window.
                _index = bboxes.index(unnormal_box)
                if token_section_num[_index] == i:
                    l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
                else:
                    continue

    return bboxes, preds, l_words, image



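# Inference on pre-computed OCR output. Long pages are split by the processor into
# overlapping 512-token windows (stride 128, return_overflowing_tokens=True); the
# per-window predictions are then de-duplicated by bounding box so each word is
# emitted exactly once.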
def process_image_encoding(model, processor, image, words, boxes, width, height):
    # encode
    inference_image = [image.convert("RGB")]
    encoding = processor(inference_image, words, boxes=boxes, truncation=True, return_offsets_mapping=True, return_tensors="pt",
                         padding="max_length", stride=128, max_length=512, return_overflowing_tokens=True)
    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')

    # change the shape of pixel values
    x = []
    for i in range(0, len(encoding['pixel_values'])):
        x.append(encoding['pixel_values'][i])
    x = torch.stack(x)
    encoding['pixel_values'] = x

    # forward pass
    outputs = model(**encoding)

    # get predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    # only keep non-subword predictions
    preds = []
    l_words = []
    bboxes = []
    token_section_num = []

    if len(token_boxes) == 512:
        # a single 512-token window: wrap in a list so the loop below is uniform
        predictions = [predictions]
        token_boxes = [token_boxes]

    for i in range(0, len(token_boxes)):
        for j in range(0, len(token_boxes[i])):
            unnormal_box = unnormalize_box(token_boxes[i][j], width, height)
            if np.asarray(token_boxes[i][j]).shape != (4,):
                continue
            elif token_boxes[i][j] == [0, 0, 0, 0] or token_boxes[i][j] == 0:
                # padding token
                continue
            # if the bbox is not in the list yet, add a new word
            elif unnormal_box not in bboxes:
                preds.append(predictions[i][j])
                l_words.append(processor.tokenizer.decode(encoding["input_ids"][i][j]))
                bboxes.append(unnormal_box)
                token_section_num.append(i)
            else:
                # the bbox already exists: documents longer than 512 tokens are split
                # into overlapping windows, so the same word can appear twice; only
                # merge subwords coming from the same window
                _index = bboxes.index(unnormal_box)
                if token_section_num[_index] == i:
                    l_words[_index] = l_words[_index] + processor.tokenizer.decode(encoding["input_ids"][i][j])
                else:
                    continue

    return bboxes, preds, l_words, image


def process_form_(json_df):

    labels = [x['LABEL'] for x in json_df]
    texts = [x['TEXT'] for x in json_df]
    cmb_list = []
    for i, j in enumerate(labels):
        cmb_list.append([labels[i], texts[i]])

    # collapse consecutive entries sharing the same label into one [label, text, ...] row
    grouper = lambda l: [[k] + sum((v[1::] for v in vs), []) for k, vs in groupby(l, lambda x: x[0])]

    list_final = grouper(cmb_list)
    lst_final = []
    for x in list_final:
        json_dict = {}
        json_dict[x[0]] = ' '.join(x[1:])
        lst_final.append(json_dict)

    return lst_final


def createExcel(maindf, detailsdf, pdffile):
    outputPath = f'{pdffile}.xlsx'
    with pd.ExcelWriter(outputPath, engine='xlsxwriter') as writer:
        maindf.to_excel(writer, sheet_name='headers', index=False)
        detailsdf.to_excel(writer, sheet_name='details', index=False)
        # auto-fit column widths to the longest cell (or header) in each column
        worksheet1 = writer.sheets["headers"]
        for idx, col in enumerate(maindf):
            series = maindf[col]
            max_len = max((
                series.astype(str).map(len).max(),
                len(str(series.name))
            )) + 1
            worksheet1.set_column(idx, idx, max_len)
        worksheet2 = writer.sheets["details"]
        for idx, col in enumerate(detailsdf):
            series = detailsdf[col]
            max_len = max((
                series.astype(str).map(len).max(),
                len(str(series.name))
            )) + 1
            worksheet2.set_column(idx, idx, max_len)
    return outputPath


def visualize_image(final_bbox, final_preds, l_words, image, label2color):

    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    json_df = []

    for ix, (prediction, box) in enumerate(zip(final_preds, final_bbox)):
        if prediction is not None:
            predicted_label = iob_to_label(prediction).lower()
            if predicted_label not in ["others"]:
                draw.rectangle(box, outline=label2color[predicted_label])
                draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
                json_dict = {}
                json_dict['TEXT'] = l_words[ix]
                # the color string doubles as the label key; color2label_except() maps it back later
                json_dict['LABEL'] = label2color[predicted_label]
                json_df.append(json_dict)
    return image, json_df

def rotate_image(image):
    extracted_text = pytesseract.image_to_string(image)
    # check if the image contains any text
    if not extracted_text:
        print("The image does not contain any text.")
        return None
    elif extracted_text.isspace():
        print("The image contains only spaces.")
        return None
    # Tesseract OSD reports the rotation it detects; rotate by the complement to undo it
    text = pytesseract.image_to_osd(image)
    angle = int(re.search(r'(?<=Rotate: )\d+', text).group(0))
    angle = 360 - angle
    rotated = ndimage.rotate(image, angle)
    data = Image.fromarray(rotated)
    return data


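# Deskewing via projection profiles: for each candidate angle the binarized page is
# rotated and its row sums are taken; the angle that maximizes the squared differences
# between adjacent row sums (i.e. the sharpest text-line peaks) wins.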
# correct the skewness of images
def correct_skew(image, delta=1, limit=5):
    def determine_score(arr, angle):
        data = inter.rotate(arr, angle, reshape=False, order=0)
        histogram = np.sum(data, axis=1, dtype=float)
        score = np.sum((histogram[1:] - histogram[:-1]) ** 2, dtype=float)
        return histogram, score

    # Convert the PIL Image object to a grayscale numpy array
    image = np.asarray(image.convert('L'), dtype=np.uint8)

    # Apply Otsu thresholding
    thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    scores = []
    angles = np.arange(-limit, limit + delta, delta)
    for angle in angles:
        histogram, score = determine_score(thresh, angle)
        scores.append(score)
    best_angle = angles[scores.index(max(scores))]

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, best_angle, 1.0)
    corrected = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                               borderMode=cv2.BORDER_REPLICATE)
    return best_angle, corrected


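# Ruling-line removal: morphological opening with long thin kernels (40x1 and 1x40)
# isolates horizontal and vertical lines, whose contours are then painted white so
# table borders do not confuse the OCR.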
def removeBorders(img):
    result = img.copy()

    if len(result.shape) == 2:
        # if the input image is grayscale, convert it to BGR format
        result = cv2.cvtColor(result, cv2.COLOR_GRAY2BGR)

    gray = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)  # convert to grayscale
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Remove horizontal lines
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    remove_horizontal = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    cnts = cv2.findContours(remove_horizontal, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(result, [c], -1, (255, 255, 255), 5)

    # Remove vertical lines
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    remove_vertical = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)
    cnts = cv2.findContours(remove_vertical, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(result, [c], -1, (255, 255, 255), 5)

    return result

def color2label_except(label2color, excluded_labels):
    """
    Inversely maps colors to labels based on the provided label2color dictionary,
    excluding the specified labels.

    Args:
        label2color (dict): Dictionary mapping labels to colors.
        excluded_labels (list): List of labels to exclude.

    Returns:
        dict: Dictionary mapping colors to labels, excluding the specified labels.
    """
    # Filter out excluded labels from the label2color dictionary
    filtered_label2color = {label: color for label, color in label2color.items() if label not in excluded_labels}

    # Invert the filtered label2color dictionary to create the color2label mapping
    return {v: k for k, v in filtered_label2color.items()}


def add_dataframe(df_main, labels_repeating, label2color):
    # map colors back to labels, skipping the repeating (table) labels handled elsewhere
    col_name_map = color2label_except(label2color, labels_repeating)

    columns = list(col_name_map.values())
    data = {col: [] for col in columns}
    for i in df_main:
        for k, v in i.items():
            if k in col_name_map:
                data[col_name_map[k]].append(v)

    # join the list of strings for each column and convert to a dataframe
    for col in columns:
        data[col] = [' '.join(data[col])]
    df_upper = pd.DataFrame(data)
    key_value_pairs = []
    for col in df_upper.columns:
        key_value_pairs.append({'key': col, 'value': df_upper[col][0]})
    df_key_value = pd.DataFrame(key_value_pairs)
    # Extract the value from the containertype column
    # container_quantity = int(df_key_value[df_key_value['key'] == 'containertype']['value'].str.split("x").str[0])

    # # Add a new row to the DataFrame
    # df_key_value = df_key_value.append({'key': 'containerquantity', 'value': container_quantity}, ignore_index=True)

    # # Extract the desired value from the containertype column
    # df_key_value.loc[df_key_value['key'] == 'containertype', 'value'] = df_key_value.loc[df_key_value['key'] == 'containertype', 'value'].str.split("x").str[1]
    return df_key_value


import statistics

def id2label_row(s, id2label):
    # pass label strings through unchanged; map integer ids to their label names
    if s in id2label.values():
        return s
    return id2label[s]

def dist_height(y1, y2):
    return abs(int(y1) - int(y2))


def mergeBoxes(df):
    # smallest box enclosing every bbox in the dataframe
    xmin, ymin, xmax, ymax = [], [], [], []
    for i in range(df.shape[0]):
        box = df['bbox_column'].iloc[i]
        xmin.append(box[0])
        ymin.append(box[1])
        xmax.append(box[2])
        ymax.append(box[3])
    return [min(xmin), min(ymin), max(xmax), max(ymax)]


def transform_dataset(df, merge_labels):
    df_temp = df.iloc[merge_labels]  # a duplicate df with only the concerned rows
    df_temp.reset_index(drop=True, inplace=True)
    text = ' '.join(df_temp['scr_column'])
    bbox = mergeBoxes(df_temp)
    retain_index = merge_labels[0]  # the first index is the parent row
    df['scr_column'].iloc[retain_index] = text
    df['bbox_column'].iloc[retain_index] = bbox
    # keeping the first row & removing the rest
    df = df.loc[~df.index.isin(merge_labels[1:]), :]
    df.reset_index(drop=True, inplace=True)
    return df


def box_overlap(box1, box2, horizontal_vertical):
    # Extract coordinates of box1
    x1_box1, y1_box1, x2_box1, y2_box1 = box1
    # Extract coordinates of box2
    x1_box2, y1_box2, x2_box2, y2_box2 = box2

    # Check whether the boxes overlap along the requested axis
    if horizontal_vertical == "H":
        if x1_box1 <= x2_box2 and x2_box1 >= x1_box2:
            return True
        else:
            return False
    if horizontal_vertical == "V":
        if y1_box1 <= y2_box2 and y2_box1 >= y1_box2:
            return True
        else:
            return False


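# Line-level merging for table labels: boxes are first inflated by x_change/y_change,
# then boxes with the same predicted label that sit within ~1.5x the estimated font
# height and overlap horizontally are chained together into a single row entry.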
def horizontal_merging(df, font_length, perform_overlapping=False, x_change=0, y_change=0):
    fat_df = df.copy()
    for i in range(df.shape[0]):
        box = fat_df['bbox_column'].iloc[i]
        fat_df['bbox_column'].iloc[i] = [box[0]-x_change, box[1]-y_change, box[2]+x_change, box[3]+y_change]
    redundant_rows = []
    if perform_overlapping == True:
        for i in range(fat_df.shape[0]):
            box_i = fat_df.bbox_column[i]
            indices2merge = []

            for j in range(i+1, fat_df.shape[0]):
                if fat_df.preds_column[j] == fat_df.preds_column[i]:  # if the labels are the same
                    box_j = fat_df.bbox_column[j]
                    if abs(box_i[1]-box_j[3]) < font_length*1.5:  # boxes at a height within 50% more than the font size
                        # Check if the boxes overlap horizontally
                        if box_overlap(box_i, box_j, 'H'):
                            indices2merge.append(j)
                            df.scr_column[i] += df.scr_column[j]
                            box_i = fat_df.bbox_column[j]  # continue the chain from the newly connected word

            # once we have all indices that belong to a particular category,
            # merge the bounding boxes, keeping them in the first row
            if len(indices2merge) != 0:
                df['bbox_column'].iloc[i] = mergeBoxes(df.loc[indices2merge])
                redundant_rows.extend(indices2merge)

    # now that all the transformation is done, remove the redundant rows
    return df.drop(redundant_rows)


def mergeLabelsExtensive_repeating(df_grouped, repeating_label):
    # this function merges same-label entities together into a single instance
    df_grouped.reset_index(inplace=True, drop=True)
    df_grouped = df_grouped[df_grouped['preds_column'].isin(repeating_label)]
    # estimate the font height from (up to) the first five boxes
    font_length = 0
    count = 0
    while count < 5 and count < df_grouped.shape[0]:
        box_i = df_grouped['bbox_column'].iloc[count]  # box of the current label as [x1, y1, x3, y3]
        font_length += box_i[3] - box_i[1]
        count += 1
    font_length = font_length / max(count, 1)

    df_grouped = horizontal_merging(df_grouped, font_length, True, 30, 0)
    return df_grouped



def group_labels_wrt_height(df):
    """
    This function groups the labels based on the height of the bounding box.
    """
    # sort the lines by height using the 'y_axis' column
    df = df.sort_values(by='y_axis')
    df.reset_index(inplace=True, drop=True)
    print("entering: group_labels_wrt_height ")

    final_yaxis = []
    final_scr = []
    final_pred = []

    current_group = []
    current_scr = []
    current_pred = []

    # Iterate through the column values
    for i, (value, scr, preds) in enumerate(zip(df['y_axis'], df['scr_column'], df['preds_column'])):
        if i == 0:
            # Start a new group with the first value
            current_group.append(value)
            current_scr.append(scr)
            current_pred.append(preds)
        else:
            # Check if the difference between the current value and the previous value is <= 35
            if abs(value - df['y_axis'][i - 1]) <= 35:
                # Add the value to the current group
                current_group.append(value)
                current_scr.append(scr)
                current_pred.append(preds)
            else:
                # Start a new group with the current value
                final_yaxis.append(current_group)
                final_scr.append(current_scr)
                final_pred.append(current_pred)

                current_group = [value]
                current_scr = [scr]
                current_pred = [preds]

    # Add the last group
    final_yaxis.append(current_group)
    final_scr.append(current_scr)
    final_pred.append(current_pred)

    final_grouped_df = pd.DataFrame({'y_axis': final_yaxis, 'scr_column': final_scr, 'preds_column': final_pred})

    print("Grouped df after sorting based on height")
    print_df(final_grouped_df)

    return final_grouped_df



# searches the set of labels within the given height range
def search_labelSet_height_range(df, d, keyList):
    print("search_labelSet_height_range")
    keyDict = dict.fromkeys(keyList, [])  # stores the required information as a dictionary, later converted to a df
    print("Dataframe from which extraction is going to happen: ")

    # search the df for the top y-axis value and check whether it lies within range of d
    for i in range(df.shape[0]):
        box = df['bbox_column'].iloc[i]
        if dist_height(box[1], d) < 50:
            key = df['preds_column'].iloc[i]
            keyDict[key] = df['scr_column'].iloc[i]
    return keyDict


def clean_colText(df, column):
    # strip OCR artifacts ('[', '|', '+') from the text column
    for i in range(df.shape[0]):
        df[column].iloc[i] = df[column].iloc[i].replace('[', '').replace('|', '').replace('+', '')
    return df


def find_repeatingLabels(df, labels_repeating):
    print("In find_repeatingLabels: ")
    row2drop = []  # rows that do not belong to the repeating (table) labels
    for i in range(df.shape[0]):
        df['preds_column'].iloc[i] = id2label_row(df['preds_column'].iloc[i], id2label)
        if df['preds_column'].iloc[i] not in labels_repeating:
            row2drop.append(i)
    df.drop(index=row2drop, inplace=True)
    df = clean_colText(df, 'scr_column')

    print("removing non-tabular labels.")

    df = mergeLabelsExtensive_repeating(df, labels_repeating)
    print('after merging non-tabular labels: ')

    labels_repeating = list(set(list(df["preds_column"])))
    print("labels_repeating in this document are: ", labels_repeating)
    # add an extra column that contains the y-axis information (height)
    df['y_axis'] = np.NaN
    for i in range(df.shape[0]):
        box = df['bbox_column'].iloc[i]
        df['y_axis'].iloc[i] = box[1]

    print("After adding y-axis data in the dataframes: ")
    df = mergeLabelsExtensive(df)
    print("after merging the df extensively")
    print("Grouping the labels wrt heights: ")
    grouped_df = group_labels_wrt_height(df)

    # once labels are grouped, build one dictionary per visual line of label/value pairs
    row_dicts = []  # will contain each row of the df as a single dictionary
    for _, row in grouped_df.iterrows():
        row_dict = {}
        for preds, scr in zip(row['preds_column'], row['scr_column']):
            row_dict[preds] = scr
        row_dicts.append(row_dict)

    # creating the final table (note: DataFrame.append requires pandas < 2.0)
    final_df = pd.DataFrame(columns=labels_repeating)
    for d in row_dicts:
        final_df = final_df.append(d, ignore_index=True)
    final_df = final_df.fillna('')
    return final_df


def mergeImageVertical(images):
    # pick the smallest page and resize the others to match it (page shapes can be arbitrary)
    min_shape = sorted([(np.sum(i.size), i.size) for i in images])[0][1]
    # stack all pages vertically into one preview image
    imgs_comb = np.vstack([i.resize(min_shape) for i in images])
    imgs_comb = Image.fromarray(imgs_comb)
    return imgs_comb

def perform_erosion(img):
    # Check if the image is already in grayscale
    if len(img.shape) == 2:
        gray = img
    else:
        # Convert the image to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Define the kernel for erosion and dilation
    kernel = np.ones((3, 3), np.uint8)

    # Perform erosion followed by dilation
    erosion = cv2.erode(gray, kernel, iterations=1)
    dilation = cv2.dilate(erosion, kernel, iterations=1)

    # Double the size of the image
    double_size = cv2.resize(gray, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)

    # Perform erosion on the doubled image
    double_erosion = cv2.erode(double_size, kernel, iterations=1)

    return double_erosion



def remove_leading_trailing_special_characters(input_string):
    # strip any non-alphanumeric characters from both ends of the string
    cleaned_string = re.sub(r'^[^A-Za-z0-9]+|[^A-Za-z0-9]+$', '', str(input_string))
    return cleaned_string

def clean_dataframe(df):
    # Apply the remove_leading_trailing_special_characters function to all string columns
    for column in df.select_dtypes(include='object').columns:
        df[column] = df[column].apply(remove_leading_trailing_special_characters)

    # Replace NaN values with blanks
    df = df.fillna('')
    return df

def mergeLabelsExtensive(df_grouped):
    i = 0
    while i < df_grouped.shape[0]:
        merge_labels = [i]  # collects indices whose data has been merged, so they can be deleted afterwards
        label = df_grouped['preds_column'].iloc[i]
        box1 = df_grouped['bbox_column'].iloc[i]

        for j in range(i+1, df_grouped.shape[0]):
            box2 = df_grouped['bbox_column'].iloc[j]
            if label == df_grouped['preds_column'].iloc[j] and dist_height(box1[3], box2[3]) < 20:  # boxes within 20 pixels of each other
                merge_labels.append(j)
        print_df(df_grouped)
        df_grouped = transform_dataset(df_grouped, merge_labels)
        i = i + 1
    return df_grouped

# NOTE: multilabelsHandle is not called by the Gradio pipeline below and relies on
# helpers (extract_yaxis, get_heights, search_labelSet_between_h1_h2) that are not
# defined in this file.
def multilabelsHandle(df, thermo_details):
    # Since 0 is assigned to 'others' and these values are not so important, we delete them.
    df = df[df.preds_column != 0]
    df.reset_index(drop=True, inplace=True)
    for i in range(df.shape[0]):
        df['preds_column'].iloc[i] = id2label.get(df['preds_column'].iloc[i])
    df['preds_column'].unique()
    df_grouped = df.copy()  # stores the index of relevant labels
    df_grouped.shape[0]
    for i in range(df.shape[0]):
        if df['preds_column'].iloc[i] not in thermo_details:
            df_grouped.drop(i, inplace=True)
    df_grouped.reset_index(drop=True, inplace=True)

    keyList = df_grouped['preds_column'].unique()
    df_grouped = mergeLabelsExtensive(df_grouped)

    # extract the height of the boxes
    df_grouped = extract_yaxis(df_grouped)
    shipment_labels = ['delivery_name', 'delivery_address', 'contact_phone']
    # shipment
    heights_shipment = get_heights(df_grouped, shipment_labels)

    # now segregate the other repeating values in the df (measure, weight, volume, etc.);
    # they are contained within the heights, which act as boundaries
    df_labelSet = pd.DataFrame(columns=thermo_details)
    for i in range(len(heights_shipment)):
        if i == len(heights_shipment) - 1:
            new_df = search_labelSet_between_h1_h2(df_grouped, heights_shipment[i], 5000, keyList)
        else:
            new_df = search_labelSet_between_h1_h2(df_grouped, heights_shipment[i], heights_shipment[i+1], keyList)
        df_labelSet = df_labelSet.append(new_df, ignore_index=True)
    return df_labelSet


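# End-to-end pipeline per PDF: render each page at high resolution, fix orientation
# (Tesseract OSD) and skew, erase table borders, run the selected OCR, classify
# tokens with LayoutLMv3, then split the results into a header key/value table and
# a line-item table, and export both to Excel.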
def completepreprocess(pdffile, ocr_type):
    myDataFrame = pd.DataFrame()
    myDataFrame2 = pd.DataFrame()
    merge_pages = []
    doc = fitz.open(pdffile)
    for i in range(0, len(doc)):
        page = doc.load_page(i)
        zoom = 2
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, dpi=300)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        ro_image = rotate_image(image)
        if ro_image is None:
            return None
        angle, skewed_image = correct_skew(ro_image)
        if skewed_image is None:
            return None
        remove_border = removeBorders(skewed_image)
        image = Image.fromarray(remove_border)
        width, height = image.size
        label2color = generate_unique_colors(id2label)
        if ocr_type == "GoogleVisionOCR":
            words, boxes = process_image_GoogleVision(image, width, height)
        else:
            words, boxes = process_image_pytesseract(image, width, height)

        bbox, preds, words, image = process_image_encoding(model, processor, image, words, boxes, width, height)
        im, df_visualize = visualize_image(bbox, preds, words, image, label2color)
        df_main = process_form_(df_visualize)

        bbox_column = bbox
        preds_column = preds
        scr_column = words

        # dictionary of lists holding one page's boxes, labels and words
        page_data = {'bbox_column': bbox_column, 'preds_column': preds_column, 'scr_column': scr_column}
        df_single_page = pd.DataFrame(page_data)
        labels_repeating = ['art_code', 'ref_code', 'detail_desc', 'lot_id', 'detail_qty', 'detail_um', 'detail_tare', 'detail_grossw', 'detail_netw', 'detail_origin', 'varieta', 'raccolta']
        df_repeating_page = find_repeatingLabels(df_single_page, labels_repeating)
        myDataFrame2 = myDataFrame2.append(df_repeating_page, sort=False)

        df1 = add_dataframe(df_main, labels_repeating, label2color).astype(str)
        myDataFrame = myDataFrame.append(df1, sort=False).reset_index(drop=True)
        # drop header rows whose value came back empty
        row2drop = []
        for r in range(myDataFrame.shape[0]):
            if len(myDataFrame['value'].iloc[r]) == 0:
                row2drop.append(r)
        myDataFrame.drop(index=row2drop, inplace=True)
        myDataFrame.reset_index(drop=True, inplace=True)
        myDataFrame = myDataFrame[myDataFrame["value"].notnull()]
        myDataFrame.drop_duplicates(subset=["key"], inplace=True)
        myDataFrame2 = myDataFrame2.loc[:, ~(myDataFrame2.apply(lambda x: all(isinstance(val, list) and len(val) == 0 for val in x)))]
        merge_pages.append(im)
    im2 = mergeImageVertical(merge_pages)
    myDataFrame2 = clean_dataframe(myDataFrame2)
    myDataFrame = clean_dataframe(myDataFrame)
    myDataFrame = myDataFrame[myDataFrame['key'] != 'others']
    output_excel_path = createExcel(myDataFrame, myDataFrame2, pdffile.name)
    return im2, myDataFrame, myDataFrame2, output_excel_path


title = "Interactive demo: Transport Document Information Extraction model PDF/Images"
description = "Results will show up in a few seconds. The model was trained on 1326 images, with 226 images held out for testing. The annotated image can be opened in a new window for a better view."

css = """.output_image, .input_image {height: 600px !important}"""
examples = []

iface = gr.Interface(
    fn=completepreprocess,
    inputs=[
        gr.components.File(label="PDF"),
        gr.components.Dropdown(label="Select the OCR", choices=["Pytesseract", "GoogleVisionOCR"]),
    ],
    outputs=[
        gr.components.Image(type="pil", label="annotated image"),
        "dataframe",
        "dataframe",
        gr.File(label="Excel output")
    ],
    title=title,
    description=description,
    examples=examples,
    css=css
)

iface.launch(inline=True, debug=True)