Spaces:
Running
Running
File size: 5,368 Bytes
4dcdb35 18def71 4dcdb35 8ec1357 4dcdb35 18def71 4dcdb35 18def71 4dcdb35 8ec1357 18def71 8ec1357 18def71 8ec1357 4dcdb35 8ec1357 18def71 8ec1357 18def71 8ec1357 18def71 4dcdb35 18def71 4dcdb35 7ee1423 4dcdb35 8ec1357 18def71 7ee1423 18def71 4dcdb35 7ee1423 8ec1357 18def71 4dcdb35 18def71 e9361c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import os
import shutil
import uuid
import fitz # PyMuPDF
import gradio as gr
from modelscope import AutoModel, AutoTokenizer
from PIL import Image, ImageEnhance
from got_ocr import got_ocr
# Load the GOT-OCR 2.0 tokenizer and model once, at import time.
tokenizer = AutoTokenizer.from_pretrained(
    "stepfun-ai/GOT-OCR2_0",
    trust_remote_code=True,
)
# The freshly loaded model is switched to eval mode and moved to CUDA in the
# same expression, so `model` is only ever bound to the ready-to-use object.
model = (
    AutoModel.from_pretrained(
        "stepfun-ai/GOT-OCR2_0",
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="cuda",
        use_safetensors=True,
    )
    .eval()
    .cuda()
)
# Working directories, relative to the current working directory.
UPLOAD_FOLDER = "./uploads"
RESULTS_FOLDER = "./results"

# Make sure both directories exist; an already existing directory is fine.
for _required_dir in (UPLOAD_FOLDER, RESULTS_FOLDER):
    os.makedirs(_required_dir, exist_ok=True)
def pdf_to_images(pdf_path, zoom=4, contrast=1.5):
    """Render every page of a PDF to a contrast-enhanced RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied to both axes when rasterising a page.
            Defaults to 4, the value that used to be hard-coded.
        contrast: Factor passed to ``ImageEnhance.Contrast(...).enhance``.
            Defaults to 1.5 (a 50% contrast boost), the value that used to
            be hard-coded.

    Returns:
        A list with one ``PIL.Image.Image`` per page, in page order.
    """
    # The render matrix is the same for every page, so build it once.
    render_matrix = fitz.Matrix(zoom, zoom)
    images = []
    # The context manager closes the document even if rendering a page raises.
    with fitz.open(pdf_path) as pdf_document:
        for page in pdf_document:
            pix = page.get_pixmap(matrix=render_matrix, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            images.append(ImageEnhance.Contrast(img).enhance(contrast))
    return images
def process_pdf(pdf_file):
    """Convert an uploaded PDF into one PNG file per page.

    Args:
        pdf_file: Value delivered by the file-upload component, or ``None``.
            Accepted either as an object exposing the file path as ``.name``
            or as the path itself (which form arrives presumably depends on
            the Gradio version -- confirm against the installed release).

    Returns:
        A list of PNG file paths in page order, or ``None`` when ``pdf_file``
        is ``None``.
    """
    if pdf_file is None:
        return None

    # Fall back to the value itself when it has no ``.name`` attribute.
    source_path = getattr(pdf_file, "name", pdf_file)

    job_id = str(uuid.uuid4())
    temp_pdf_path = os.path.join(UPLOAD_FOLDER, f"{job_id}.pdf")
    # Copy the upload to a private temporary location before rendering.
    shutil.copy(source_path, temp_pdf_path)
    try:
        images = pdf_to_images(temp_pdf_path)
    finally:
        # Remove the temporary copy even if the conversion failed.
        os.remove(temp_pdf_path)

    # Each upload gets its own sub-folder so that a later upload (or another
    # user) cannot overwrite page_N.png files a gallery is still displaying.
    # NOTE(review): these per-upload folders are never cleaned up here.
    output_dir = os.path.join(RESULTS_FOLDER, job_id)
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []
    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(output_dir, f"page_{page_number}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)
    return image_paths
def on_image_select(evt: gr.SelectData):
    """Return the index carried by a selection event.

    ``evt.index`` is handed back unchanged, so the result is ``None`` exactly
    when the event carries no index.
    """
    return evt.index
def perform_ocr(
    selected_index,
    image_gallery,
    task="plain texts OCR",
    fine_grained_type=None,
    color=None,
    box=None,
):
    """Run GOT OCR on the gallery image the user selected.

    Args:
        selected_index: Position of the selected image in ``image_gallery``,
            or ``None`` when nothing has been selected yet.
        image_gallery: Current gallery value; may be ``None`` or empty.
        task: GOT mode, forwarded to ``got_ocr`` as ``got_mode``. The default
            matches the initial value of the mode dropdown.
        fine_grained_type: ``"box"``, ``"color"`` or ``None``; forwarded as
            ``fine_grained_mode``.
        color: Colour name, forwarded only when ``fine_grained_type`` is
            ``"color"`` (otherwise an empty string is sent).
        box: Box string, forwarded only when ``fine_grained_type`` is
            ``"box"`` (otherwise an empty string is sent).

    Returns:
        The first element of the pair returned by ``got_ocr``, or a message
        asking the user to select an image when there is no valid selection.
    """
    # `not image_gallery` also covers a gallery value of None, which would
    # make len() raise.
    if selected_index is None or not image_gallery:
        return "请先选择一张图片"
    # The stored index can be stale, e.g. after a shorter PDF was uploaded.
    if not 0 <= selected_index < len(image_gallery):
        return "请先选择一张图片"

    selected_image = image_gallery[selected_index]
    # Gradio may deliver gallery entries as (image, caption) pairs; only the
    # image part is passed on. NOTE(review): assumes got_ocr wants the bare
    # image/path rather than the pair -- confirm against got_ocr.
    if isinstance(selected_image, (tuple, list)):
        selected_image = selected_image[0]

    # Only the option that belongs to the chosen fine-grained type is sent.
    ocr_color = color if fine_grained_type == "color" else ""
    ocr_box = box if fine_grained_type == "box" else ""
    result, _ = got_ocr(
        model,
        tokenizer,
        selected_image,
        got_mode=task,
        fine_grained_mode=fine_grained_type,
        ocr_color=ocr_color,
        ocr_box=ocr_box,
    )
    return result
# Gradio UI: upload a PDF, preview its pages in a gallery, pick a page and a
# GOT mode, then run OCR on the selected page.
with gr.Blocks() as demo:
    pdf_input = gr.File(label="上传PDF文件")
    image_gallery = gr.Gallery(label="PDF页面预览", columns=3, height="auto")
    # Index of the gallery image selected last; None until a selection is made.
    selected_index = gr.State(None)
    task_dropdown = gr.Dropdown(
        choices=[
            "plain texts OCR",
            "format texts OCR",
            "plain multi-crop OCR",
            "format multi-crop OCR",
            "plain fine-grained OCR",
            "format fine-grained OCR",
        ],
        label="选择GOT模式",
        value="plain texts OCR",
    )
    # The fine-grained controls start hidden and are revealed by the change
    # handlers below.
    fine_grained_dropdown = gr.Dropdown(choices=["box", "color"], label="fine-grained类型", visible=False)
    color_dropdown = gr.Dropdown(choices=["red", "green", "blue"], label="颜色列表", visible=False)
    box_input = gr.Textbox(label="输入框: [x1,y1,x2,y2]", placeholder="例如: [0,0,100,100]", visible=False)

    def task_update(task, fine_grained_type=None):
        """Return visibility updates for the three fine-grained controls.

        The type dropdown is shown only for fine-grained tasks. The colour
        and box inputs follow the currently selected fine-grained type, so
        switching between the two fine-grained tasks does not hide the input
        that belongs to a type which is still selected.
        """
        is_fine_grained = bool(task) and "fine-grained" in task
        return [
            gr.update(visible=is_fine_grained),
            gr.update(visible=is_fine_grained and fine_grained_type == "color"),
            gr.update(visible=is_fine_grained and fine_grained_type == "box"),
        ]

    def fine_grained_update(fine_grained_type):
        """Show the colour dropdown for "color", the box input for "box"."""
        return [
            gr.update(visible=fine_grained_type == "color"),
            gr.update(visible=fine_grained_type == "box"),
        ]

    task_dropdown.change(
        task_update,
        inputs=[task_dropdown, fine_grained_dropdown],
        outputs=[fine_grained_dropdown, color_dropdown, box_input],
    )
    fine_grained_dropdown.change(
        fine_grained_update,
        inputs=[fine_grained_dropdown],
        outputs=[color_dropdown, box_input],
    )

    ocr_button = gr.Button("开始OCR识别")
    ocr_result = gr.Textbox(label="OCR结果")

    # Single click handler that passes every argument perform_ocr takes. A
    # second registration with only two inputs used to follow and made each
    # click additionally fail on the missing arguments; it has been removed.
    ocr_button.click(
        fn=perform_ocr,
        inputs=[selected_index, image_gallery, task_dropdown, fine_grained_dropdown, color_dropdown, box_input],
        outputs=ocr_result,
    )
    pdf_input.upload(fn=process_pdf, inputs=pdf_input, outputs=image_gallery)
    image_gallery.select(fn=on_image_select, inputs=[], outputs=selected_index)
# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|