Spaces:
Runtime error
Runtime error
File size: 12,716 Bytes
2623e99 561f457 2623e99 01bf385 2623e99 01bf385 2623e99 561f457 2623e99 11ba4ce 2623e99 1cd0594 bf9a3a8 2623e99 b35eb97 17ed041 2623e99 17ed041 2623e99 561f457 17ed041 2623e99 01bf385 2623e99 cee9c65 2623e99 561f457 d0dc5bb 561f457 d0dc5bb 561f457 2623e99 b35eb97 2623e99 b35eb97 561f457 2623e99 01bf385 2623e99 01bf385 561f457 2623e99 561f457 2623e99 561f457 2623e99 b35eb97 2623e99 767c35f d0dc5bb fa85331 2623e99 d0dc5bb 2623e99 ea9880c 2623e99 d0dc5bb 8bf7635 d0dc5bb 2623e99 d095e6e 2623e99 8bf7635 fa85331 2623e99 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 |
#-*- coding: UTF-8 -*-
# Copyright 2022 the HuggingFace Team.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import traceback
import base64
import gradio as gr
import cv2
from paddlenlp import Taskflow
from paddlenlp.utils.doc_parser import DocParser
# Module-level document reader shared by the callbacks below: it loads PDFs and
# images (read_pdf / read_image) and draws extraction results onto a page
# (write_image_with_results).
doc_parser = DocParser()
# Module-level information-extraction pipeline, created once when this file is
# imported and reused for every request (see run_taskflow).
# NOTE(review): from_hf_hub=True presumably makes Taskflow resolve task_path on
# the Hugging Face Hub - confirm against the PaddleNLP Taskflow documentation.
task_instance = Taskflow(
"information_extraction",
model="uie-x-base",
task_path="PaddlePaddle/uie-x-base",
from_hf_hub=True)
# Demo rows offered through gr.Examples. Each row is a
# [document file name, schema string] pair; rows stay lists because
# process_doc tests membership with a list literal.
examples = [
    ["business_card.png", "Name;Title;Web Link;Email;Address"],
    ["license.jpeg", "Name;DOB;ISS;EXP"],
    ["statements.png", "Date|Gross profit"],
    ["invoice.jpeg", "名称;纳税人识别号;开票日期"],
    [
        "custom.jpeg",
        "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号",
    ],
    ["resume.png", "职位;年龄;学校|时间;学校|专业"],
]

# Reverse lookup used when an example is selected: schema string -> file name.
example_files = {
    "Name;Title;Web Link;Email;Address": "business_card.png",
    "Name;DOB;ISS;EXP": "license.jpeg",
    "Date|Gross profit": "statements.png",
    "职位;年龄;学校|时间;学校|专业": "resume.png",
    "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号": "custom.jpeg",
    "名称;纳税人识别号;开票日期": "invoice.jpeg",
}

# OCR language to use for each bundled example document.
lang_map = {
    "resume.png": "ch",
    "custom.jpeg": "ch",
    "business_card.png": "en",
    "invoice.jpeg": "ch",
    "license.jpeg": "en",
    "statements.png": "en",
}
def dbc2sbc(s):
    """Convert full-width (double-byte) characters in *s* to half-width ASCII.

    The ideographic space U+3000 becomes a regular space, and the full-width
    forms U+FF01..U+FF5E become their ASCII counterparts U+0021..U+007E.
    Every other character is returned unchanged.

    Args:
        s: Text that may contain full-width punctuation, letters or digits.

    Returns:
        A new string with the full-width characters normalised.
    """
    converted = []
    for char in s:
        code = ord(char)
        if code == 0x3000:
            # Handled on its own branch: 0x20 lies outside the 0x21..0x7E
            # range used for the other forms, so a shared range check would
            # leave the full-width space unconverted.
            converted.append(" ")
        elif 0xFF01 <= code <= 0xFF5E:
            # Full-width forms sit at a fixed offset from printable ASCII.
            converted.append(chr(code - 0xFEE0))
        else:
            converted.append(char)
    return "".join(converted)
def np2base64(image_np):
    """Encode an image as JPEG and return the bytes as a base64 text string.

    Args:
        image_np: Image data in whatever form ``cv2.imencode`` accepts
            (presumably a NumPy array, going by the name - confirm with caller).

    Returns:
        The base64 representation of the JPEG-encoded image as ``str``.
    """
    # imencode returns (success_flag, buffer); only the buffer is used.
    _, jpeg_buffer = cv2.imencode('.jpg', image_np)
    # The base64 alphabet is pure ASCII, so decoding yields exactly the text
    # between the quotes of the bytes repr.
    return base64.b64encode(jpeg_buffer).decode("ascii")
def process_path(path):
    """Load the document at *path* and build updates for the preview widgets.

    The returned tuple has exactly one entry per component wired to this
    callback, in the order
    ``[document, image, img_clear_button, output, url_error]``.

    Args:
        path: File path (or URL text) of a PDF or image; may be empty/None.

    Returns:
        A 5-tuple. On success: the path, a visible gallery holding the loaded
        page, a visible clear button, and hidden/cleared output and error
        boxes. On failure or empty input: everything reset, with the error box
        made visible and filled with the exception text when loading raised.
    """
    error = None
    if path:
        try:
            if path.endswith(".pdf"):
                pages = [doc_parser.read_pdf(path)]
            else:
                pages = [doc_parser.read_image(path)]
        except Exception as e:
            # UI boundary: log the traceback and surface the message to the
            # user instead of crashing the callback.
            traceback.print_exc()
            error = str(e)
        else:
            return (
                path,
                gr.update(visible=True, value=pages),
                gr.update(visible=True),
                gr.update(visible=False, value=None),
                gr.update(visible=False, value=None),
            )
    if error is not None:
        error_update = gr.update(visible=True, value=error)
    else:
        error_update = gr.update(visible=False, value=None)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        # The error update must be the fifth value so it lands on url_error.
        error_update,
    )
def process_upload(file):
    """Handle a change of the upload widget.

    Args:
        file: The uploaded file object exposing a ``name`` path attribute, or
            a falsy value when the widget was cleared.

    Returns:
        The result of ``process_path`` for the uploaded file, or a 5-tuple
        that resets ``[document, image, img_clear_button, output, url_error]``
        when there is no file.
    """
    if file:
        return process_path(file.name)
    # One value per wired output component: clear the stored document and
    # hide every dependent widget.
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        gr.update(visible=False, value=None),
    )
def get_schema(schema_str):
    """Parse a schema string into the list structure passed to the extractor.

    Items are separated by ``;``. An item without ``|`` is an entity type and
    is added as a plain string. An item of the form ``subject|relation`` is a
    relation; relations sharing a subject are merged into one
    ``{subject: [relation, ...]}`` dict placed where the subject first
    appeared. Whitespace around items is stripped and empty items (e.g. from
    a trailing ``;``) are skipped so no blank prompt is produced.

    Args:
        schema_str: Schema text such as ``"Name;Person|Email;Person|Date"``.

    Returns:
        A ``(schema_list, schema_lang)`` tuple where ``schema_lang`` is
        ``"ch"`` if the text contains any character in U+4E00..U+9FFF and
        ``"en"`` otherwise.
    """
    has_chinese = any("\u4e00" <= ch <= "\u9fff" for ch in schema_str)
    schema_lang = "ch" if has_chinese else "en"

    schema_list = []
    # subject -> the relation list already stored inside schema_list, so a
    # repeated subject extends the existing entry without rescanning the list.
    relations_by_subject = {}
    for item in schema_str.split(";"):
        parts = [part.strip() for part in item.split("|")]
        subject = parts[0]
        relations = [rel for rel in parts[1:] if rel]
        if not subject:
            continue
        if not relations:
            schema_list.append(subject)
        elif subject in relations_by_subject:
            relations_by_subject[subject].extend(relations)
        else:
            relations_by_subject[subject] = relations
            schema_list.append({subject: relations})
    return schema_list, schema_lang
def run_taskflow(document, schema, argument):
    """Run the shared extraction pipeline on a single document.

    Args:
        document: Path of the document, passed under the ``'doc'`` key.
        schema: Parsed schema handed to ``task_instance.set_schema``.
        argument: Options dict handed to ``task_instance.set_argument``.

    Returns:
        Whatever ``task_instance`` returns for the one-document input.
    """
    # The pipeline is a module-level singleton, so it is reconfigured on
    # every call before being invoked.
    task_instance.set_schema(schema)
    task_instance.set_argument(argument)
    request = {'doc': document}
    return task_instance(request)
def process_doc(document, schema, ocr_lang, layout_analysis):
    """Run extraction on *document* and build the preview and JSON updates.

    Args:
        document: Path of the document to analyse, or None if none is loaded.
        schema: Raw schema text from the UI; a default schema is used if empty.
        ocr_lang: OCR language choice; overridden for the bundled examples.
        layout_analysis: ``"yes"`` to enable layout analysis, anything else
            disables it.

    Returns:
        ``(None, None)`` when there is no document, otherwise a pair of
        updates: a visible gallery with the annotated page and a visible
        output holding the prediction.
    """
    if document is None:
        return None, None

    # Bundled examples always use the OCR language recorded for them.
    if [document, schema] in examples:
        ocr_lang = lang_map[document]

    schema = schema or '时间;组织机构;人物'

    # Normalise full-width separators before splitting the schema.
    parsed_schema, schema_lang = get_schema(dbc2sbc(schema))
    argument = {
        "ocr_lang": ocr_lang,
        "schema_lang": schema_lang,
        "layout_analysis": layout_analysis == "yes",
    }
    prediction = run_taskflow(document, parsed_schema, argument)[0]

    # Reload the page so the results can be drawn on top of it.
    read_page = doc_parser.read_pdf if document.endswith(".pdf") else doc_parser.read_image
    page = read_page(document)
    annotated = doc_parser.write_image_with_results(
        np2base64(page),
        result=prediction,
        return_image=True,
    )

    preview_update = gr.update(visible=True, value=[annotated])
    answer_update = gr.update(visible=True, value=prediction)
    return preview_update, answer_update
def load_example_document(img, schema, ocr_lang, layout_analysis):
    """Load the bundled example matching *schema* and run extraction on it.

    Args:
        img: Value of the hidden example image component; None means no
            example is selected.
        schema: Example schema string, used as the key into ``example_files``.
        ocr_lang: Ignored on input; replaced by the example's language.
        layout_analysis: Forwarded unchanged to ``process_doc``.

    Returns:
        A 5-tuple ``(document, schema, preview, clear-button update, answer)``;
        all entries are None (with the button hidden) when *img* is None.
    """
    if img is None:
        return None, None, None, gr.update(visible=False), None

    document = example_files[schema]
    # Only the part before any "-" in the language entry is used.
    ocr_lang = lang_map[document].split("-")[0]
    preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
    return document, schema, preview, gr.update(visible=True), answer
def read_content(file_path: str) -> str:
    """Return the full text of the UTF-8 encoded file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
# Build the Gradio Blocks UI and wire its event handlers.
# NOTE(review): leading indentation was lost in this copy of the file, so the
# nesting of the `with` layout containers below cannot be read off the text;
# restore it from the upstream source before running.
with gr.Blocks() as demo:
# Static page header loaded from a sibling HTML file.
gr.HTML(read_content("header.html"))
gr.Markdown(
"Open-sourced by PaddleNLP, **UIE-X** is a universal information extraction engine for both scanned document and text inputs. It supports Entity Extraction, Relation Extraction and Event Extraction tasks."
"UIE-X performs well on a zero-shot settings, which is enabled by a flexible schema that allows you to specify extraction targets with simple natural language."
"Moreover, on PaddleNLP, we provide a comprehensive and easy-to-use fine-tuning and few-shot customization workflow."
"For more details, please visit our [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/README_en.md)"
)
# `document` holds the currently loaded document path between callbacks: it is
# an output of the load handlers and an input of process_doc.
document = gr.Variable()
# `is_text` is not referenced by any handler in this file.
is_text = gr.Variable()
# Hidden components populated by gr.Examples; a change of example_image
# triggers load_example_document (wired further down).
example_schema = gr.Textbox(visible=False)
example_image = gr.Image(visible=False)
# --- Section 1: choosing a document (URL, upload, or bundled example) ---
with gr.Row(equal_height=True):
with gr.Column():
with gr.Row():
gr.Markdown("## 1. Select a file 选择文件", elem_id="select-a-file")
img_clear_button = gr.Button(
"Clear", variant="secondary", elem_id="file-clear", visible=False
)
# Preview gallery; hidden until a document has been loaded.
image = gr.Gallery(visible=False)
with gr.Row(equal_height=True):
with gr.Column():
with gr.Row():
url = gr.Textbox(
show_label=False,
placeholder="URL",
lines=1,
max_lines=1,
elem_id="url-textbox",
)
submit = gr.Button("Get")
# Read-only box intended for load errors; hidden by default.
url_error = gr.Textbox(
visible=False,
elem_id="url-error",
max_lines=1,
interactive=False,
label="Error",
)
gr.Markdown("## <center> — or — </center>")
upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
gr.Examples(
examples=examples,
inputs=[example_image, example_schema],
)
# --- Section 2: schema entry, options, and extraction output ---
with gr.Column():
gr.Markdown("## 2. Information Extraction 信息抽取 ")
gr.Markdown("### 👉 Set a schema 设置schema")
gr.Markdown("Entity extraction: entity type should be separated by ';', e.g. **Person;Organization**")
gr.Markdown("实体抽取:实体类别之间以';'分割,例如 **人物;组织机构**")
gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**")
gr.Markdown("关系抽取:需配置主体和关系类别,中间以'|'分割,例如 **人物|出生时间;人物|邮箱**")
gr.Markdown("### 👉 Model customization 模型定制")
gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)")
gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)的流程进一步增强模型在特定场景的效果")
schema = gr.Textbox(
label="Schema",
placeholder="e.g. Name|Company;Name|Position;Email;Phone Number",
lines=1,
max_lines=1,
)
ocr_lang = gr.Radio(
choices=["ch", "en"],
value="en",
label="OCR语言 / OCR Language (Please choose ch for Chinese images.)",
)
# process_doc treats the value "yes" as enabled and anything else as disabled.
layout_analysis = gr.Radio(
choices=["yes", "no"],
value="no",
label="版面分析 / Layout analysis (Better extraction for multi-line text)",
)
with gr.Row():
clear_button = gr.Button("Clear", variant="secondary")
submit_button = gr.Button(
"Submit", variant="primary", elem_id="submit-button"
)
with gr.Column():
output = gr.JSON(label="Output", visible=False)
# --- Event wiring ---
# Both clear buttons run the same reset: the lambda returns nine values, one
# per entry of `outputs`, in the same order. Note that `inputs` is
# clear_button for both registrations; the lambda ignores its argument.
for cb in [img_clear_button, clear_button]:
cb.click(
lambda _: (
gr.update(visible=False, value=None),
None,
gr.update(visible=False, value=None),
gr.update(visible=False),
None,
None,
None,
gr.update(visible=False, value=None),
None,
),
inputs=clear_button,
outputs=[
image,
document,
output,
img_clear_button,
example_image,
upload,
url,
url_error,
schema,
],
)
# Uploading a file and pressing "Get" on a URL both load a document and
# target the same five output components.
upload.change(
fn=process_upload,
inputs=[upload],
outputs=[document, image, img_clear_button, output, url_error],
)
submit.click(
fn=process_path,
inputs=[url],
outputs=[document, image, img_clear_button, output, url_error],
)
# Pressing Enter in the schema box and clicking "Submit" both run extraction.
schema.submit(
fn=process_doc,
inputs=[document, schema, ocr_lang, layout_analysis],
outputs=[image, output],
)
submit_button.click(
fn=process_doc,
inputs=[document, schema, ocr_lang, layout_analysis],
outputs=[image, output],
)
# Selecting a bundled example loads it and runs extraction immediately.
example_image.change(
fn=load_example_document,
inputs=[example_image, example_schema, ocr_lang, layout_analysis],
outputs=[document, schema, image, img_clear_button, output],
)
# Static page footer loaded from a sibling HTML file.
gr.HTML(read_content("footer.html"))
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
demo.launch(enable_queue=False)
|