import os
import json

import gradio as gr
from loguru import logger

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config

# Enable magic_pdf's built-in model so no external model output is required.
model_config.__use_inside_model__ = True

# Directory (relative to the working directory) where uploaded PDFs are
# stored; extracted images go into an "images" sub-directory of it.
_UPLOAD_DIR = 'uploads'


def process_pdf(file_path):
    """Convert the PDF at *file_path* to Markdown using the UNIPipe pipeline.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``pipe.pipe_mk_markdown`` returns (displayed as Markdown by
        the UI), or ``None`` when any step raises or when no model input is
        available.
    """
    try:
        # Use a context manager so the file handle is closed promptly.
        with open(file_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # Passing an empty list as model_json makes the pipeline use the
        # built-in model for parsing.
        model_json = []
        jso_useful_key = {"_pdf_type": "", "model_list": model_json}

        local_image_dir = os.path.join(_UPLOAD_DIR, 'images')
        os.makedirs(local_image_dir, exist_ok=True)
        # Only the directory's base name ("images") is handed to the
        # Markdown generator.
        image_dir = str(os.path.basename(local_image_dir))

        image_writer = DiskReaderWriter(local_image_dir)
        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()

        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()
            else:
                logger.error("need model list input")
                return None

        pipe.pipe_parse()
        return pipe.pipe_mk_markdown(image_dir, drop_mode="none")
    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and
        # report failure as None instead of crashing the handler.
        logger.exception(e)
        return None


def extract_markdown_from_pdf(pdf):
    """Save an uploaded PDF under ``uploads/`` and return its Markdown.

    Args:
        pdf: The value delivered by the ``gr.File`` input. Either a path
            string or an object exposing ``name`` (and possibly ``read()``).
            ``None`` means nothing was uploaded.

    Returns:
        The result of ``process_pdf`` for the saved copy, or ``None`` when
        no file was supplied.
    """
    if pdf is None:
        return None

    is_path = isinstance(pdf, str)
    source_path = pdf if is_path else pdf.name

    # Save the uploaded PDF file. Only the base name is used: joining
    # 'uploads' with an absolute path would discard 'uploads' and point the
    # destination back at the source file.
    os.makedirs(_UPLOAD_DIR, exist_ok=True)
    file_path = os.path.join(_UPLOAD_DIR, os.path.basename(source_path))

    # Read every byte BEFORE opening the destination for writing, so the
    # source cannot be truncated even if both paths name the same file.
    if is_path or os.path.isfile(source_path):
        with open(source_path, 'rb') as src:
            pdf_bytes = src.read()
    else:
        # No on-disk file under that name: fall back to the object's own
        # read() method, as the original code did.
        pdf_bytes = pdf.read()

    with open(file_path, 'wb') as f:
        f.write(pdf_bytes)

    # Process the PDF file and generate the Markdown content.
    return process_pdf(file_path)


def main():
    """Build the Gradio interface and launch it with sharing enabled."""
    # Create the Gradio interface.
    with gr.Blocks() as demo:
        gr.Markdown("# PDF to Markdown Converter")

        with gr.Row():
            with gr.Column():
                pdf_file = gr.File(label="Upload PDF", file_types=['.pdf'])
                md_output = gr.Markdown(label="Extracted Markdown")

        extract_button = gr.Button("Extract Markdown")
        extract_button.click(
            extract_markdown_from_pdf,
            inputs=[pdf_file],
            outputs=[md_output],
        )

    demo.launch(share=True)


if __name__ == '__main__':
    main()