File size: 2,117 Bytes
240e0a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import os
import json
import gradio as gr
from loguru import logger
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
# Flag read by process_pdf(): when no model list is supplied, pipe_analyze()
# is only run if this is truthy; otherwise the conversion is aborted.
model_config.__use_inside_model__ = True
def process_pdf(file_path):
    """Convert the PDF stored at ``file_path`` to Markdown via ``UNIPipe``.

    Images extracted by the pipeline are written below ``uploads/images``
    (created on demand); the Markdown is generated with ``images`` as the
    image directory argument.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``pipe.pipe_mk_markdown`` returns on success, or ``None``
        when no model list is available or any step raises (the exception
        is logged, not propagated).
    """
    try:
        # Context manager so the file handle is closed deterministically.
        with open(file_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # An empty model list means the built-in model is used for parsing.
        model_json = []
        jso_useful_key = {"_pdf_type": "", "model_list": model_json}

        local_image_dir = os.path.join('uploads', 'images')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(local_image_dir, exist_ok=True)
        image_dir = str(os.path.basename(local_image_dir))
        image_writer = DiskReaderWriter(local_image_dir)

        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()

        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error("need model list input")
                return None

        pipe.pipe_parse()
        return pipe.pipe_mk_markdown(image_dir, drop_mode="none")
    except Exception as e:
        # UI boundary: log the full traceback and signal failure with None.
        logger.exception(e)
        return None
def extract_markdown_from_pdf(pdf):
    """Store an uploaded PDF under ``uploads/`` and convert it to Markdown.

    Args:
        pdf: The upload handed over by the UI. Either a path string, or a
            file-like object exposing ``.name`` (and ``.read()`` when the
            name does not point at a file on disk). ``None`` means nothing
            was uploaded.
            NOTE(review): which of these shapes arrives depends on the
            installed Gradio version - confirm against the deployment.

    Returns:
        The result of ``process_pdf`` for the stored copy, or ``None`` when
        no file was provided.
    """
    if pdf is None:
        # Button pressed without an upload: nothing to convert.
        return None

    source_path = pdf if isinstance(pdf, str) else pdf.name

    # Read the payload completely BEFORE opening the destination for
    # writing, so the source is never truncated even if both paths coincide.
    if isinstance(pdf, str) or os.path.isfile(source_path):
        with open(source_path, 'rb') as src:
            pdf_bytes = src.read()
    else:
        pdf_bytes = pdf.read()

    # Save the uploaded PDF. Only the basename is used: os.path.join()
    # discards 'uploads' when given an absolute path, and a client-supplied
    # name must not be able to escape the upload directory.
    upload_dir = 'uploads'
    os.makedirs(upload_dir, exist_ok=True)
    file_path = os.path.join(upload_dir, os.path.basename(source_path))
    with open(file_path, 'wb') as f:
        f.write(pdf_bytes)

    # Process the stored PDF and produce the Markdown content.
    return process_pdf(file_path)
def main():
    """Build the Gradio Blocks UI for PDF-to-Markdown conversion and launch it.

    The page holds a PDF upload widget, a Markdown output area and a button
    that runs ``extract_markdown_from_pdf`` on the upload. The app is
    launched with ``share=True``.
    """
    # Build the Gradio interface.
    with gr.Blocks() as demo:
        gr.Markdown("# PDF to Markdown Converter")
        with gr.Row():
            with gr.Column():
                upload_widget = gr.File(
                    label="Upload PDF",
                    file_types=['.pdf'],
                )
                markdown_view = gr.Markdown(label="Extracted Markdown")
                run_button = gr.Button("Extract Markdown")

        # Wire the button: uploaded file in, rendered Markdown out.
        run_button.click(
            extract_markdown_from_pdf,
            inputs=[upload_widget],
            outputs=[markdown_view],
        )

    demo.launch(share=True)
# Start the web UI only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|