Spaces:

PaddlePaddle
/

UIE-X

Runtime error

File size: 12,716 Bytes

#-*- coding: UTF-8 -*-
# Copyright 2022 the HuggingFace Team.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import traceback
import base64

import gradio as gr
import cv2

from paddlenlp import Taskflow
from paddlenlp.utils.doc_parser import DocParser


# Shared module-level instances, created once at import time and reused by
# every handler below.
# doc_parser is used in this file to read images/PDFs and to draw the
# extraction results onto the page image.
doc_parser = DocParser()
# Information-extraction Taskflow configured with the "uie-x-base" model.
# NOTE(review): from_hf_hub=True presumably makes PaddleNLP fetch the weights
# from the Hugging Face Hub repo named in task_path — confirm against the
# PaddleNLP Taskflow documentation.
task_instance = Taskflow(
    "information_extraction",
    model="uie-x-base",
    task_path="PaddlePaddle/uie-x-base",
    from_hf_hub=True)

# Single source of truth for the bundled demo documents:
# (file name, OCR language, schema string). The three lookup structures
# below are all derived from this table so they cannot drift apart.
_EXAMPLE_TABLE = (
    ("business_card.png", "en", "Name;Title;Web Link;Email;Address"),
    ("license.jpeg", "en", "Name;DOB;ISS;EXP"),
    ("statements.png", "en", "Date|Gross profit"),
    ("invoice.jpeg", "ch", "名称;纳税人识别号;开票日期"),
    (
        "custom.jpeg",
        "ch",
        "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号",
    ),
    ("resume.png", "ch", "职位;年龄;学校|时间;学校|专业"),
)

# Rows shown by gr.Examples: [file name, schema string]. They must stay lists
# because process_doc tests membership with a list literal.
examples = [[file_name, schema_str] for file_name, _, schema_str in _EXAMPLE_TABLE]

# Schema string -> example file name.
example_files = {schema_str: file_name for file_name, _, schema_str in _EXAMPLE_TABLE}

# Example file name -> OCR language code ("ch" or "en").
lang_map = {file_name: lang for file_name, lang, _ in _EXAMPLE_TABLE}

# Translation table for dbc2sbc: the ideographic space U+3000 maps to an ASCII
# space, and the full-width forms U+FF01..U+FF5E map to U+0021..U+007E
# (each is exactly 0xFEE0 above its ASCII counterpart).
_FULLWIDTH_TO_ASCII = {0x3000: 0x0020}
_FULLWIDTH_TO_ASCII.update(
    (code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F)
)


def dbc2sbc(s):
    """Convert full-width (double-byte) characters in ``s`` to ASCII.

    Full-width forms U+FF01..U+FF5E become their ASCII equivalents and the
    ideographic space U+3000 becomes a regular space. Every other character
    is returned unchanged.

    Args:
        s: Input string.

    Returns:
        str: ``s`` with full-width characters replaced by ASCII ones.
    """
    # The U+3000 -> space mapping is handled by the table itself, so it is
    # not filtered out by a 0x21..0x7E range check on the converted code.
    return s.translate(_FULLWIDTH_TO_ASCII)


def np2base64(image_np):
    """Encode an image array as JPEG and return the bytes as base64 text.

    Args:
        image_np: Image array passed straight to ``cv2.imencode``.

    Returns:
        str: Base64 text of the JPEG-encoded image.
    """
    # cv2.imencode returns a pair; only the encoded buffer (index 1) is used.
    encoded = cv2.imencode('.jpg', image_np)[1]
    # Decode the base64 bytes directly rather than slicing the "b'...'" repr
    # produced by str(bytes), which relies on repr formatting and raises
    # BytesWarning-as-error when Python runs with -bb.
    return base64.b64encode(encoded).decode("ascii")


def process_path(path):
    """Load the document at ``path`` and build the UI updates to preview it.

    The returned tuple has exactly one value per component this handler is
    wired to in this file, in this order:
    ``[document, image, img_clear_button, output, url_error]``.
    The error update is the fifth value so that it lands on ``url_error``.

    Args:
        path: Location of the document. Paths ending in ``.pdf`` are read with
            ``doc_parser.read_pdf``; everything else with
            ``doc_parser.read_image``. A falsy value resets the UI.

    Returns:
        tuple: ``(document, image_update, clear_button_update,
        output_update, url_error_update)``. On success ``document`` is
        ``path`` and the preview gallery is shown; otherwise ``document`` is
        ``None``, the preview is hidden and, if loading raised, the error
        text box is shown with the exception message.
    """
    error = None
    if path:
        try:
            if path.endswith(".pdf"):
                pages = [doc_parser.read_pdf(path)]
            else:
                pages = [doc_parser.read_image(path)]
        except Exception as e:
            # UI boundary: log the traceback and report the message in the
            # error box instead of letting the handler crash.
            traceback.print_exc()
            error = str(e)
        else:
            return (
                path,
                gr.update(visible=True, value=pages),
                gr.update(visible=True),
                gr.update(visible=False, value=None),
                gr.update(visible=False, value=None),
            )

    if error is not None:
        error_update = gr.update(visible=True, value=error)
    else:
        error_update = gr.update(visible=False, value=None)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        error_update,
    )


def process_upload(file):
    """Handle a change of the upload widget.

    Args:
        file: The uploaded file object (its ``name`` attribute is used as the
            path), or a falsy value when the upload was cleared.

    Returns:
        tuple: The result of ``process_path(file.name)`` when a file is
        present. Otherwise a reset tuple with one value per component wired
        to ``upload.change`` in this file
        (``[document, image, img_clear_button, output, url_error]``):
        no document, and the preview, clear button, output and error box
        all hidden.
    """
    if file:
        return process_path(file.name)
    # Nothing uploaded: clear the document and hide every dependent widget.
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        gr.update(visible=False, value=None),
    )

def get_schema(schema_str):
    """Parse a schema string into a schema list plus its language.

    Items are separated by ``;``. An item without ``|`` is an entity type and
    is added as a plain string. An item of the form ``subject|rel1|rel2`` is
    added as ``{subject: [rel1, rel2]}``; repeated subjects are merged into
    the dict created for their first occurrence.

    Surrounding whitespace is stripped from every name, and empty names
    (e.g. from a trailing ``;`` or ``|``) are dropped so that no empty prompt
    ends up in the schema.

    Args:
        schema_str: Schema text such as ``"Name;Person|Email;Person|Date"``.

    Returns:
        tuple: ``(schema_list, schema_lang)`` where ``schema_lang`` is
        ``"ch"`` if the string contains any character in U+4E00..U+9FFF and
        ``"en"`` otherwise.
    """
    has_chinese = any("\u4e00" <= ch <= "\u9fff" for ch in schema_str)
    schema_lang = "ch" if has_chinese else "en"

    schema_list = []
    # subject -> the relation list already stored inside schema_list, so a
    # repeated subject can be merged without rescanning the list.
    relations_by_subject = {}
    for item in schema_str.split(";"):
        parts = [part.strip() for part in item.split("|")]
        subject = parts[0]
        relations = [part for part in parts[1:] if part]
        if not subject:
            continue
        if not relations:
            schema_list.append(subject)
        elif subject in relations_by_subject:
            relations_by_subject[subject].extend(relations)
        else:
            # The same list object is shared by the lookup and the schema
            # entry, so later extends are visible in schema_list.
            relations_by_subject[subject] = relations
            schema_list.append({subject: relations})
    return schema_list, schema_lang


def run_taskflow(document, schema, argument):
    """Configure the shared Taskflow and run it on a single document.

    Args:
        document: Document path, passed to the Taskflow under the ``doc`` key.
        schema: Schema handed to ``task_instance.set_schema``.
        argument: Options handed to ``task_instance.set_argument``.

    Returns:
        Whatever ``task_instance`` returns for the one-document request.
    """
    # The instance is shared, so schema and options are set before each call.
    task_instance.set_schema(schema)
    task_instance.set_argument(argument)
    request = {'doc': document}
    predictions = task_instance(request)
    return predictions


def process_doc(document, schema, ocr_lang, layout_analysis):
    """Run extraction on ``document`` and build the preview/result updates.

    Args:
        document: Path of the loaded document, or ``None`` if nothing is
            loaded.
        schema: Schema string typed by the user; when empty the default
            ``'时间;组织机构;人物'`` is used.
        ocr_lang: OCR language code; overridden with the ``lang_map`` entry
            when ``[document, schema]`` is one of the bundled examples.
        layout_analysis: ``"yes"`` to enable layout analysis; any other value
            disables it.

    Returns:
        tuple: ``(None, None)`` when no document is loaded, otherwise
        ``(image_update, output_update)`` carrying the annotated page image
        and the first prediction.
    """
    if document is None:
        return None, None

    # Bundled examples always run with their known OCR language.
    if [document, schema] in examples:
        ocr_lang = lang_map[document]
    if not schema:
        schema = '时间;组织机构;人物'

    # Normalise full-width punctuation before splitting the schema.
    parsed_schema, schema_lang = get_schema(dbc2sbc(schema))
    argument = {
        "ocr_lang": ocr_lang,
        "schema_lang": schema_lang,
        "layout_analysis": layout_analysis == "yes",
    }
    prediction = run_taskflow(document, parsed_schema, argument)[0]

    # Re-read the page so the results can be drawn on top of it.
    if document.endswith(".pdf"):
        reader = doc_parser.read_pdf
    else:
        reader = doc_parser.read_image
    page = reader(document)

    annotated = doc_parser.write_image_with_results(
        np2base64(page),
        result=prediction,
        return_image=True)

    image_update = gr.update(visible=True, value=[annotated])
    output_update = gr.update(visible=True, value=prediction)
    return image_update, output_update


def load_example_document(img, schema, ocr_lang, layout_analysis):
    """Load the bundled example that matches ``schema`` and run extraction.

    Args:
        img: Value of the hidden example image; ``None`` means no example is
            selected.
        schema: Example schema string, used as the key into ``example_files``.
        ocr_lang: Incoming OCR language; replaced by the example's language
            from ``lang_map``.
        layout_analysis: Passed through to ``process_doc``.

    Returns:
        tuple: ``(document, schema, preview, clear_button_update, answer)``.
        When ``img`` is ``None`` every value is ``None`` except the button
        update, which hides the button.
    """
    if img is None:
        return None, None, None, gr.update(visible=False), None

    document = example_files[schema]
    # Keep only the part of the language code before any "-".
    ocr_lang = lang_map[document].split("-")[0]
    preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
    return document, schema, preview, gr.update(visible=True), answer


def read_content(file_path: str) -> str:
    """Return the entire text of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()


# UI definition: page layout first, then the event wiring that connects the
# widgets to the handler functions defined above.
with gr.Blocks() as demo:
    gr.HTML(read_content("header.html"))
    # Adjacent string literals are concatenated with nothing in between, so
    # each sentence carries its own trailing space.
    gr.Markdown(
        "Open-sourced by PaddleNLP, **UIE-X** is a universal information extraction engine for both scanned document and text inputs. It supports Entity Extraction, Relation Extraction and Event Extraction tasks. "
        "UIE-X performs well on a zero-shot settings, which is enabled by a flexible schema that allows you to specify extraction targets with simple natural language. "
        "Moreover, on PaddleNLP, we provide a comprehensive and easy-to-use fine-tuning and few-shot customization workflow. "
        "For more details, please visit our [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/README_en.md)"
    )

    # Holds the path of the currently loaded document between events.
    document = gr.Variable()
    # NOTE(review): is_text is not wired to any event in this file.
    is_text = gr.Variable()
    # Hidden targets that gr.Examples fills in; a change of example_image
    # triggers load_example_document (wired further down).
    example_schema = gr.Textbox(visible=False)
    example_image = gr.Image(visible=False)
    with gr.Row(equal_height=True):
        # Left column: choose a document by URL, upload, or bundled example.
        with gr.Column():
            with gr.Row():
                gr.Markdown("## 1. Select a file 选择文件", elem_id="select-a-file")
                img_clear_button = gr.Button(
                    "Clear", variant="secondary", elem_id="file-clear", visible=False
                )
            # Preview gallery; hidden until a document is loaded.
            image = gr.Gallery(visible=False)
            with gr.Row(equal_height=True):
                with gr.Column():
                    with gr.Row():
                        url = gr.Textbox(
                            show_label=False,
                            placeholder="URL",
                            lines=1,
                            max_lines=1,
                            elem_id="url-textbox",
                        )
                        submit = gr.Button("Get")
                    # Read-only box for load errors; hidden by default.
                    url_error = gr.Textbox(
                        visible=False,
                        elem_id="url-error",
                        max_lines=1,
                        interactive=False,
                        label="Error",
                    )
            gr.Markdown("## <center> — or — </center>")
            upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
            gr.Examples(
                examples=examples,
                inputs=[example_image, example_schema],
            )

        # Right column: schema entry, options, and the extraction output.
        with gr.Column():
            gr.Markdown("## 2. Information Extraction 信息抽取 ")
            gr.Markdown("### 👉 Set a schema 设置schema")
            gr.Markdown("Entity extraction: entity type should be separated by ';', e.g. **Person;Organization**")
            gr.Markdown("实体抽取：实体类别之间以';'分割，例如 **人物；组织机构**")
            gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**")
            gr.Markdown("关系抽取：需配置主体和关系类别，中间以'|'分割，例如 **人物|出生时间；人物|邮箱**")
            gr.Markdown("### 👉 Model customization 模型定制")
            gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)")
            gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)的流程进一步增强模型在特定场景的效果")

            schema = gr.Textbox(
                label="Schema",
                placeholder="e.g. Name|Company;Name|Position;Email;Phone Number",
                lines=1,
                max_lines=1,
            )

            ocr_lang = gr.Radio(
                choices=["ch", "en"],
                value="en",
                label="OCR语言 / OCR Language (Please choose ch for Chinese images.)",
            )

            layout_analysis = gr.Radio(
                choices=["yes", "no"],
                value="no",
                label="版面分析 / Layout analysis (Better extraction for multi-line text)",
            )

            with gr.Row():
                clear_button = gr.Button("Clear", variant="secondary")
                submit_button = gr.Button(
                    "Submit", variant="primary", elem_id="submit-button"
                )
            with gr.Column():
                output = gr.JSON(label="Output", visible=False)

    # Both "Clear" buttons reset the whole page. The lambda returns one value
    # per entry of `outputs`, in the same order; its single argument is ignored.
    for cb in [img_clear_button, clear_button]:
        cb.click(
            lambda _: (
                gr.update(visible=False, value=None),
                None,
                gr.update(visible=False, value=None),
                gr.update(visible=False),
                None,
                None,
                None,
                gr.update(visible=False, value=None),
                None,
            ),
            inputs=clear_button,
            outputs=[
                image,
                document,
                output,
                img_clear_button,
                example_image,
                upload,
                url,
                url_error,
                schema,
            ],
        )

    # Loading a document, either by upload or from the URL box.
    upload.change(
        fn=process_upload,
        inputs=[upload],
        outputs=[document, image, img_clear_button, output, url_error],
    )
    submit.click(
        fn=process_path,
        inputs=[url],
        outputs=[document, image, img_clear_button, output, url_error],
    )

    # Running extraction: pressing Enter in the schema box or clicking Submit.
    schema.submit(
        fn=process_doc,
        inputs=[document, schema, ocr_lang, layout_analysis],
        outputs=[image, output],
    )

    submit_button.click(
        fn=process_doc,
        inputs=[document, schema, ocr_lang, layout_analysis],
        outputs=[image, output],
    )

    # Selecting a bundled example loads it and runs extraction right away.
    example_image.change(
        fn=load_example_document,
        inputs=[example_image, example_schema, ocr_lang, layout_analysis],
        outputs=[document, schema, image, img_clear_button, output],
    )

    gr.HTML(read_content("footer.html"))


if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script.
    # NOTE(review): enable_queue=False presumably turns off Gradio's request
    # queue — confirm against the installed Gradio version.
    demo.launch(enable_queue=False)