File size: 5,937 Bytes
711e2cf
62c2edb
 
711e2cf
a749292
be1e29d
62c2edb
a749292
79746f6
ac592f2
4d31938
af14138
a749292
61a37e8
18def71
af14138
 
 
61a37e8
4d31938
d1d6f35
4d31938
61a37e8
4d31938
18def71
711e2cf
 
 
 
 
 
 
 
 
 
 
 
 
4dcdb35
a749292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac592f2
dd08fd0
a749292
dd08fd0
18def71
61a37e8
dd08fd0
 
d0f2987
dd08fd0
 
a749292
711e2cf
 
 
 
a749292
dd08fd0
 
 
a749292
 
dd08fd0
 
 
 
a749292
dd08fd0
 
be1e29d
 
 
0b4c6fc
be1e29d
 
0b4c6fc
be1e29d
 
0b4c6fc
be1e29d
dd08fd0
be1e29d
 
0b4c6fc
be1e29d
a749292
dd08fd0
61a37e8
5f2ab23
79746f6
 
a749292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd08fd0
a749292
 
 
 
79746f6
 
61a37e8
0b4c6fc
 
 
 
 
 
 
 
 
 
65cf8b6
 
0b4c6fc
 
 
 
 
 
 
 
 
 
 
 
dd08fd0
 
0b4c6fc
dd08fd0
0b4c6fc
 
dd08fd0
65cf8b6
 
6f13504
0b4c6fc
6f13504
0b4c6fc
 
7ee1423
e9361c0
79746f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import atexit
import base64
import os
import shutil
import tempfile
import time

import fitz
import gradio as gr
import spaces
import torch
import transformers
from PIL import Image, ImageEnhance
from transformers import AutoModel, AutoTokenizer

# NOTE(review): move_cache() presumably migrates a legacy Transformers cache
# layout — confirm it is still needed with the installed version.
transformers.utils.move_cache()
# Limit Transformers logging to error-level messages.
transformers.logging.set_verbosity_error()

# Model identifier passed to from_pretrained, and the device to run on
# (CUDA when torch reports it as available, otherwise CPU).
model_name = "ucaslcl/GOT-OCR2_0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# trust_remote_code=True allows both loaders to run Python code shipped with
# the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, device_map=device)
# Switch to evaluation mode; .to(device) targets the same device that was
# already requested through device_map above.
model = model.eval().to(device)

# Create a temporary directory that lives for the whole process
TEMP_DIR = tempfile.mkdtemp()


def cleanup() -> None:
    """Delete the temporary directory and its contents, ignoring errors."""
    shutil.rmtree(TEMP_DIR, ignore_errors=True)


# Make sure the temporary directory is removed when the program exits

atexit.register(cleanup)


def pdf_to_images(pdf_path, zoom=10, contrast=1.5):
    """Render every page of a PDF to a contrast-enhanced RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied to both axes when rasterising a page.
            Defaults to 10, the value that used to be hard-coded; large
            values produce very large images.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
            The default of 1.5 raises contrast by 50%.

    Returns:
        A list with one ``PIL.Image.Image`` per page, in page order.
    """
    # The render matrix is identical for every page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    images = []

    # The context manager closes the document even when a page fails to
    # render, which the previous explicit close() call did not guarantee.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(ImageEnhance.Contrast(img).enhance(contrast))

    return images


@spaces.GPU()
def convert_pdf_to_images(file):
    """Convert an uploaded PDF into one PNG file per page.

    Args:
        file: The value delivered by the file-upload component: either an
            object exposing the file path as ``.name`` or a plain path
            string. ``None`` when nothing was uploaded.

    Returns:
        A ``(status_message, image_paths)`` tuple. ``image_paths`` is the
        list of written PNG paths on success and ``None`` on any failure.
    """
    if file is None:
        return "错误:未提供文件", None

    try:
        # Accept both file-like uploads (with .name) and plain path strings.
        pdf_path = getattr(file, "name", file)
        if not str(pdf_path).lower().endswith(".pdf"):
            return "错误:请上传PDF文件", None

        # Give each conversion its own subdirectory: writing page_N.png
        # straight into TEMP_DIR let a later (or concurrent) upload overwrite
        # the pages of an earlier one. The subdirectory still lives under
        # TEMP_DIR, so the exit-time cleanup removes it as before.
        output_dir = tempfile.mkdtemp(dir=TEMP_DIR)

        image_paths = []
        for page_number, image in enumerate(pdf_to_images(pdf_path), start=1):
            img_path = os.path.join(output_dir, f"page_{page_number}.png")
            image.save(img_path, "PNG")
            image_paths.append(img_path)

        return "PDF转换为图片成功", image_paths
    except Exception as e:
        # UI boundary: report the failure as a status message instead of
        # letting the exception propagate to the caller.
        return f"错误: {e}", None


@spaces.GPU()
def ocr_process(image, got_mode, ocr_color="", ocr_box="", progress=gr.Progress()):
    """Run OCR on the selected image and report progress to the UI.

    Args:
        image: Path of the image to recognise, or ``None`` when no image
            has been selected.
        got_mode: Name of the OCR mode, forwarded to
            ``process_single_image``.
        ocr_color: Colour hint forwarded to ``process_single_image``.
        ocr_box: Bounding-box hint forwarded to ``process_single_image``.
        progress: Gradio progress tracker used to update the progress bar.

    Returns:
        The string produced by ``process_single_image``, or an error
        message string when no image is selected or processing fails.
    """
    if image is None:
        return "错误:未选择图片"

    try:
        # Progress is reported around the real work only. The earlier
        # version slept for 0.5 s at four fabricated "stages", adding about
        # two seconds of pure delay to every request.
        progress(0, desc="开始处理...")
        progress(0.4, desc="文字识别中...")

        result = process_single_image(image, got_mode, ocr_color, ocr_box)

        progress(0.8, desc="生成结果...")
        progress(1, desc="处理完成")
        return result
    except Exception as e:
        # UI boundary: surface the failure as text in the result panel.
        return f"错误: {e}"


def process_single_image(image_path, got_mode, ocr_color, ocr_box):
    """Run the OCR model on one image in the requested mode.

    The mode is chosen by substring: ``"plain"`` selects plain-text OCR and
    ``"format"`` selects formatted OCR with an HTML render; a mode that also
    contains ``"multi-crop"`` uses ``model.chat_crop`` instead of
    ``model.chat``.

    Args:
        image_path: Path of the image file to recognise.
        got_mode: Mode name, e.g. ``"plain texts OCR"``.
        ocr_color: Colour hint passed to ``model.chat``.
        ocr_box: Bounding-box hint passed to ``model.chat``.

    Returns:
        For plain modes, the value returned by the model. For format modes,
        an HTML snippet with a download link and an ``<iframe>`` preview of
        the rendered result, or the model's return value when no render
        file was produced. For any other mode, an error message string.
    """
    multi_crop = "multi-crop" in got_mode

    if "plain" in got_mode:
        if multi_crop:
            return model.chat_crop(tokenizer, image_path, ocr_type="ocr")
        return model.chat(tokenizer, image_path, ocr_type="ocr", ocr_box=ocr_box, ocr_color=ocr_color)

    if "format" in got_mode:
        result_path = f"{os.path.splitext(image_path)[0]}_result.html"

        # Remove a render left over from an earlier run on the same image so
        # a failed render cannot be mistaken for a fresh result below.
        try:
            os.remove(result_path)
        except FileNotFoundError:
            pass

        if multi_crop:
            res = model.chat_crop(tokenizer, image_path, ocr_type="format", render=True, save_render_file=result_path)
        else:
            res = model.chat(tokenizer, image_path, ocr_type="format", ocr_box=ocr_box, ocr_color=ocr_color, render=True, save_render_file=result_path)

        if not os.path.exists(result_path):
            # No render file was written: hand back what the model returned
            # rather than falling through to the "unknown mode" error, which
            # would misreport the cause.
            return res

        with open(result_path, "r", encoding="utf-8") as f:
            html_content = f.read()

        # Embed the rendered page as a base64 data URI so it can be both
        # previewed inline and offered as a download without a file server.
        encoded_html = base64.b64encode(html_content.encode("utf-8")).decode("utf-8")
        data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
        preview = f'<iframe src="{data_uri}" width="100%" height="600px"></iframe>'
        download_link = f'<a href="{data_uri}" download="result.html">下载完整结果</a>'
        return f"{download_link}\n\n{preview}"

    return "错误: 未知的OCR模式"


# Gradio UI: upload a PDF, convert it to page images, pick a page in the
# gallery, then run OCR on the selected page.
with gr.Blocks() as demo:
    gr.Markdown("# OCR 图像识别")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="上传PDF文件")
            convert_button = gr.Button("转换PDF为图片")
            # Visible status line: the conversion message (including errors)
            # used to be routed to an invisible textbox and was never shown.
            convert_status = gr.Textbox(label="转换状态", interactive=False)

        with gr.Column(scale=2):
            image_gallery = gr.Gallery(label="图片预览", columns=3)

    with gr.Row():
        with gr.Column(scale=1):
            selected_image = gr.State(value=None)  # path of the image picked in the gallery
            preview_image = gr.Image(label="选中的图片", type="filepath")
            got_mode = gr.Dropdown(
                choices=[
                    "plain texts OCR",
                    "format texts OCR",
                    "plain multi-crop OCR",
                    "format multi-crop OCR",
                    "plain fine-grained OCR",
                    "format fine-grained OCR",
                ],
                label="OCR模式",
                value="plain texts OCR",
            )
            ocr_color = gr.Textbox(label="OCR颜色 (仅用于fine-grained模式)")
            ocr_box = gr.Textbox(label="OCR边界框 (仅用于fine-grained模式)")
            ocr_button = gr.Button("开始OCR识别")

        with gr.Column(scale=2):
            ocr_output = gr.HTML(label="识别结果")

    def select_image(evt: gr.SelectData, gallery):
        """Return the path of the clicked gallery item for state and preview.

        Args:
            evt: Selection event; ``evt.index`` is the clicked item's index.
            gallery: Current gallery value as passed in by Gradio.

        Returns:
            The selected image path twice: once for ``selected_image`` and
            once for ``preview_image``.
        """
        item = gallery[evt.index]
        # Gallery items can arrive as (media, caption) pairs; OCR needs only
        # the media path.
        if isinstance(item, (tuple, list)):
            item = item[0]
        # NOTE(review): some Gradio versions appear to deliver file dicts
        # here — confirm the key names against the installed version.
        if isinstance(item, dict):
            item = item.get("name") or item.get("path")
        return item, item

    # Selecting a page both stores its path and shows it in the preview.
    image_gallery.select(select_image, image_gallery, [selected_image, preview_image])
    convert_button.click(convert_pdf_to_images, inputs=[pdf_input], outputs=[convert_status, image_gallery])
    ocr_button.click(ocr_process, inputs=[selected_image, got_mode, ocr_color, ocr_box], outputs=ocr_output)

# Launch the Gradio app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()