File size: 5,549 Bytes
1a33e6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8dcdb27
1a33e6d
 
 
8dcdb27
 
1a33e6d
 
7c1ac46
 
 
 
 
 
 
1a33e6d
 
 
5d50c04
1a33e6d
 
 
 
 
 
 
7c1ac46
 
 
1a33e6d
 
 
 
 
 
 
 
 
7c1ac46
 
 
 
 
 
 
 
 
 
 
 
1a33e6d
 
 
 
 
 
 
7c1ac46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a33e6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7c1ac46
 
 
1a33e6d
 
 
 
 
 
 
 
 
 
 
7c1ac46
 
 
1a33e6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import os

# Runtime dependency bootstrap: each entry is handed to the shell in order,
# and exit codes are ignored (best effort), exactly as a plain os.system call.
_SETUP_COMMANDS = (
    'pip install pip --upgrade',
    'pip install -q git+https://github.com/huggingface/transformers.git',
    "pip install pyyaml==5.1",
    # workaround: install old version of pytorch since detectron2 hasn't released
    # packages for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
    "pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html",
    # install detectron2 that matches pytorch 1.8
    # See https://detectron2.readthedocs.io/tutorials/install.html for instructions
    "pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html",
    # install PyTesseract
    "pip install -q pytesseract",
)

for _setup_command in _SETUP_COMMANDS:
    os.system(_setup_command)

import gradio as gr
import numpy as np
from transformers import LayoutLMv3Processor, LiltForTokenClassification
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

# `processor` is called on the raw image in process_image and `model` on the
# resulting encoding, so the processor must be the LayoutLMv3Processor and the
# model the LiltForTokenClassification (they were previously bound the other
# way round).
processor = LayoutLMv3Processor.from_pretrained("jinhybr/LiLt-funsd-en")
# NOTE(review): the model is loaded from the same FUNSD fine-tuned repository
# as the processor, since the demo text describes a fine-tuned model -- confirm
# that "jinhybr/LiLt-funsd-en" is the intended checkpoint rather than the base
# "SCUT-DLVCLab/lilt-roberta-en-base" weights.
model = LiltForTokenClassification.from_pretrained("jinhybr/LiLt-funsd-en")
####





####

# load image example
# The dataset is needed for its label names; the example shown in the UI is
# the PNG shipped next to this script, copied to the path listed in `examples`.
# (A previous load of dataset[0]["image"] was dropped: its result was
# overwritten on the very next line and never used.)
dataset = load_dataset("nielsr/funsd-layoutlmv3", split="test")
image = Image.open("./example_lm3.png")
image.save("document.png")

# Tag names in class-id order, and the id -> name lookup used to decode the
# model's argmax predictions.
labels = dataset.features["ner_tags"].feature.names
id2label = dict(enumerate(labels))




# helper function to unnormalize bboxes for drawing onto the image
def unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 coordinate grid to pixel coordinates.

    Args:
        bbox: Indexable with at least four entries, read as [x0, y0, x1, y1]
            on a 0-1000 scale.
        width: Target image width in pixels (scales x0 and x1).
        height: Target image height in pixels (scales y0 and y1).

    Returns:
        A new four-element list [x0, y0, x1, y1] in pixel units.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    # Divide first, then multiply, so results match the original arithmetic.
    frac_x0 = x0 / 1000
    frac_y0 = y0 / 1000
    frac_x1 = x1 / 1000
    frac_y1 = y1 / 1000
    return [width * frac_x0, height * frac_y0, width * frac_x1, height * frac_y1]


# Colour used for the outline and caption of each tag. The B- and I- variants
# of an entity type share one colour. "O" has no entry; draw_boxes skips it
# before looking a colour up.
label2color = {
    f"{prefix}-{entity}": color
    for prefix in ("B", "I")
    for entity, color in (("HEADER", "blue"), ("QUESTION", "red"), ("ANSWER", "green"))
}




def iob_to_label(label):
    """Strip the two-character IOB prefix (e.g. "B-") from a tag.

    Args:
        label: Tag string such as "B-HEADER", "I-ANSWER" or "O".

    Returns:
        The tag without its first two characters, or "other" when nothing
        remains (as happens for "O" or an empty string).
    """
    return label[2:] or "other"




# draw results onto the image
def draw_boxes(image, boxes, predictions):
    """Draw a coloured outline and caption for every non-"O" prediction.

    Args:
        image: PIL image to annotate; it is modified in place.
        boxes: One box per prediction, each [x0, y0, x1, y1] on the 0-1000
            scale accepted by unnormalize_box.
        predictions: Tag strings aligned with ``boxes``. Entries equal to "O"
            are skipped; every other tag must be a key of label2color.

    Returns:
        The same image object that was passed in.

    Raises:
        KeyError: If a non-"O" prediction has no entry in label2color.
    """
    width, height = image.size
    pixel_boxes = [unnormalize_box(box, width, height) for box in boxes]

    # draw predictions over the image
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for prediction, box in zip(predictions, pixel_boxes):
        if prediction == "O":
            continue
        color = label2color[prediction]
        # A single outline is enough: the black rectangle that used to be
        # drawn first was completely overdrawn by this one.
        draw.rectangle(box, outline=color)
        # Caption sits slightly right of and above the box's top-left corner.
        draw.text((box[0] + 10, box[1] - 10), text=prediction, fill=color, font=font)
    return image




def process_image(image):
    """Tag the words of a document image and draw the predictions onto it.

    Args:
        image: PIL image of the document (the Gradio input is declared with
            type="pil").

    Returns:
        The same image object, annotated in place by draw_boxes.
    """
    # encode: only the image is supplied, so words and boxes come from the
    # processor itself (presumably its built-in OCR -- pytesseract is installed
    # at start-up).
    encoding = processor(
        image, truncation=True, return_offsets_mapping=True, return_tensors="pt"
    )
    # The offsets are only needed for the sub-word filter below and must not be
    # forwarded to the model.
    offset_mapping = encoding.pop("offset_mapping")
    # LiLT consumes text and layout only. A LayoutLMv3 processor also emits
    # `pixel_values`, which the LiLT forward pass does not accept, so drop it
    # when present.
    encoding.pop("pixel_values", None)

    # forward pass
    outputs = model(**encoding)

    # get predictions
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    token_boxes = encoding.bbox.squeeze().tolist()

    # only keep non-subword predictions: a token whose character offset does
    # not start at 0 is treated as a continuation piece of a word
    is_subword = np.array(offset_mapping.squeeze().tolist())[:, 0] != 0
    true_predictions = [
        id2label[pred] for pred, subword in zip(predictions, is_subword) if not subword
    ]
    # Boxes stay on the 0-1000 scale here: draw_boxes converts them to pixels
    # itself, so converting here as well would scale them twice.
    true_boxes = [
        box for box, subword in zip(token_boxes, is_subword) if not subword
    ]

    draw_boxes(image, true_boxes, true_predictions)
    return image


# Static text shown around the demo (passed to gr.Interface below).
title = "OCR Document Parser : Information Extraction - Fine Tuned LiLT Language-independent Layout Transformer Model"
description = "Demo for  LiLT Language-independent Layout Transformer, a Transformer for state-of-the-art document image understanding tasks. This particular model is fine-tuned on FUNSD, a dataset of manually annotated forms. It annotates the words appearing in the image as QUESTION/ANSWER/HEADER/OTHER. To use it, simply upload an image or use the example image below and click 'Submit'. Results will show up in a few seconds. If you want to make the output bigger, right-click on it and select 'Open image in new tab'."
article = "<p style='text-align: center'><a href=' https://arxiv.org/abs/2202.13669' target='_blank'> LiLT Language-independent Layout Transformer</a> | <a href='https://github.com/jpwang/lilt' target='_blank'>Github Repo</a></p>"
# Example inputs offered in the UI; "document.png" is written at start-up.
examples = [["document.png"]]

# Only this rule was ever in effect: an earlier `css` assignment was
# overwritten before use and has been removed together with its commented-out
# variants.
css = ".image-preview {height: auto !important;}"

# Input and output are both PIL images; process_image annotates the upload and
# returns it.
demo_input = gr.inputs.Image(type="pil")
demo_output = gr.outputs.Image(type="pil", label="annotated image")

iface = gr.Interface(
    fn=process_image,
    inputs=demo_input,
    outputs=demo_output,
    title=title,
    description=description,
    article=article,
    examples=examples,
    css=css,
    enable_queue=True,
)

# Start the web app when the script is run.
iface.launch(debug=True)