File size: 5,368 Bytes
4dcdb35
18def71
4dcdb35
 
8ec1357
4dcdb35
18def71
 
 
 
 
 
 
 
 
4dcdb35
 
 
 
18def71
 
 
 
4dcdb35
8ec1357
 
 
 
 
18def71
 
 
 
8ec1357
18def71
 
 
 
 
8ec1357
 
 
4dcdb35
 
8ec1357
18def71
 
 
8ec1357
18def71
 
 
 
8ec1357
 
18def71
 
 
 
 
 
 
 
 
4dcdb35
 
18def71
 
 
 
4dcdb35
 
7ee1423
 
 
 
 
 
 
 
 
 
 
 
 
 
4dcdb35
 
 
8ec1357
 
18def71
7ee1423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18def71
 
4dcdb35
7ee1423
 
 
 
 
8ec1357
18def71
 
4dcdb35
18def71
e9361c0
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import shutil
import uuid

import fitz  # PyMuPDF
import gradio as gr
from modelscope import AutoModel, AutoTokenizer
from PIL import Image, ImageEnhance

from got_ocr import got_ocr

# Load the GOT-OCR 2.0 tokenizer and model once at import time; the model is
# put into eval mode and placed on the CUDA device.
tokenizer = AutoTokenizer.from_pretrained(
    "stepfun-ai/GOT-OCR2_0",
    trust_remote_code=True,
)
model = (
    AutoModel.from_pretrained(
        "stepfun-ai/GOT-OCR2_0",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="cuda",
        use_safetensors=True,
    )
    .eval()
    .cuda()
)

UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Make sure the working directories exist before any request is handled.
for _folder in (UPLOAD_FOLDER, RESULTS_FOLDER):
    os.makedirs(_folder, exist_ok=True)


def pdf_to_images(pdf_path, zoom=4, contrast=1.5):
    """Render every page of a PDF into a contrast-enhanced RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied to both axes when rasterizing a page.
            Defaults to 4, the value previously hard-coded.
        contrast: Factor passed to ``ImageEnhance.Contrast.enhance``.
            Defaults to 1.5 (a 50% contrast boost), as before.

    Returns:
        A list of ``PIL.Image.Image`` objects, one per page, in page order.
    """
    # The render matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    images = []
    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(ImageEnhance.Contrast(img).enhance(contrast))
    return images


def process_pdf(pdf_file):
    """Convert an uploaded PDF into per-page PNG files for the gallery.

    Args:
        pdf_file: The value delivered by the upload component. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object exposing
            the path as ``.name``; ``None`` when nothing was uploaded.

    Returns:
        A list of PNG file paths inside ``RESULTS_FOLDER`` (one per page, named
        ``page_<n>.png``), or ``None`` when ``pdf_file`` is ``None``.
    """
    if pdf_file is None:
        return None

    # Accept both a plain path and a file-like wrapper carrying the path in
    # ``.name``. A Path also has ``.name`` (its basename), so check for
    # path-like values first.
    if isinstance(pdf_file, (str, os.PathLike)):
        source_path = pdf_file
    else:
        source_path = pdf_file.name

    temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{uuid.uuid4()}.pdf")

    try:
        # Work on a private copy of the upload.
        shutil.copy(source_path, temp_pdf_path)
        images = pdf_to_images(temp_pdf_path)
    finally:
        # Always discard the temporary copy, even when conversion fails.
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

    # Persist each page as a PNG and hand the paths back to the caller.
    # NOTE(review): file names are not unique per upload, so a later upload
    # overwrites the pages of an earlier one — confirm this is acceptable when
    # several users share the app.
    image_paths = []
    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(RESULTS_FOLDER, f"page_{page_number}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    return image_paths


def on_image_select(evt: gr.SelectData):
    """Return the index carried by the gallery select event.

    The value is passed through unchanged, so ``None`` is returned when the
    event has no index.
    """
    return evt.index


def perform_ocr(selected_index, image_gallery, task, fine_grained_type, color, box):
    """Run GOT OCR on the gallery item chosen by the user.

    Args:
        selected_index: Position of the selected gallery item, or ``None``
            when nothing has been selected yet.
        image_gallery: Current gallery value (a sequence of items); may be
            ``None`` or empty when no PDF has been uploaded.
        task: GOT mode string forwarded as ``got_mode``.
        fine_grained_type: ``"box"``, ``"color"`` or another value; forwarded
            as ``fine_grained_mode``.
        color: Color name, only forwarded when ``fine_grained_type`` is
            ``"color"``.
        box: Box specification text, only forwarded when ``fine_grained_type``
            is ``"box"``.

    Returns:
        The first element of the pair returned by ``got_ocr``, or a prompt
        asking the user to select an image when no valid selection exists.
    """
    no_selection_message = "请先选择一张图片"

    # ``not image_gallery`` covers both None (nothing uploaded) and an empty list.
    if selected_index is None or not image_gallery:
        return no_selection_message

    # The stored index can be stale, e.g. after a shorter PDF replaced the
    # previous one; treat that the same as "nothing selected".
    if not 0 <= selected_index < len(image_gallery):
        return no_selection_message

    # NOTE(review): the gallery item is forwarded as-is; confirm that its shape
    # matches what got_ocr expects as the image argument.
    selected_image = image_gallery[selected_index]

    # Only forward the parameter that belongs to the chosen fine-grained type.
    ocr_color = color if fine_grained_type == "color" else ""
    ocr_box = box if fine_grained_type == "box" else ""

    result, _ = got_ocr(
        model,
        tokenizer,
        selected_image,
        got_mode=task,
        fine_grained_mode=fine_grained_type,
        ocr_color=ocr_color,
        ocr_box=ocr_box,
    )
    return result


# Build the UI: PDF upload, page gallery, OCR options and the result box.
with gr.Blocks() as demo:
    pdf_input = gr.File(label="上传PDF文件")
    image_gallery = gr.Gallery(label="PDF页面预览", columns=3, height="auto")
    # Index of the gallery item the user clicked last; None until a selection is made.
    selected_index = gr.State(None)
    task_dropdown = gr.Dropdown(
        choices=[
            "plain texts OCR",
            "format texts OCR",
            "plain multi-crop OCR",
            "format multi-crop OCR",
            "plain fine-grained OCR",
            "format fine-grained OCR",
        ],
        label="选择GOT模式",
        value="plain texts OCR",
    )
    # The fine-grained controls start hidden and are revealed by the handlers below.
    fine_grained_dropdown = gr.Dropdown(choices=["box", "color"], label="fine-grained类型", visible=False)
    color_dropdown = gr.Dropdown(choices=["red", "green", "blue"], label="颜色列表", visible=False)
    box_input = gr.Textbox(label="输入框: [x1,y1,x2,y2]", placeholder="例如: [0,0,100,100]", visible=False)

    def task_update(task, fine_grained_type=None):
        """Return visibility updates for the fine-grained controls.

        The type dropdown is shown only for fine-grained tasks. The color/box
        control is shown only when its type is already selected, so switching
        back to a fine-grained task restores the matching input.
        """
        is_fine_grained = "fine-grained" in (task or "")
        return [
            gr.update(visible=is_fine_grained),
            gr.update(visible=is_fine_grained and fine_grained_type == "color"),
            gr.update(visible=is_fine_grained and fine_grained_type == "box"),
        ]

    def fine_grained_update(fine_grained_type):
        """Show the color list for "color", the box input for "box", neither otherwise."""
        return [
            gr.update(visible=fine_grained_type == "color"),
            gr.update(visible=fine_grained_type == "box"),
        ]

    task_dropdown.change(
        task_update,
        inputs=[task_dropdown, fine_grained_dropdown],
        outputs=[fine_grained_dropdown, color_dropdown, box_input],
    )
    fine_grained_dropdown.change(fine_grained_update, inputs=[fine_grained_dropdown], outputs=[color_dropdown, box_input])

    ocr_button = gr.Button("开始OCR识别")
    ocr_result = gr.Textbox(label="OCR结果")

    pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
    image_gallery.select(fn=on_image_select, inputs=[], outputs=selected_index)

    # Register the OCR handler exactly once, with every argument perform_ocr
    # takes (selection, gallery, task and the fine-grained options).
    ocr_button.click(
        fn=perform_ocr,
        inputs=[selected_index, image_gallery, task_dropdown, fine_grained_dropdown, color_dropdown, box_input],
        outputs=ocr_result,
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()