|
import os |
|
import json |
|
import gradio as gr |
|
from loguru import logger |
|
from magic_pdf.pipe.UNIPipe import UNIPipe |
|
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter |
|
import magic_pdf.model as model_config |
|
|
|
# Module-level switch read by process_pdf(): when it is truthy and no model
# list has been supplied, process_pdf() calls pipe.pipe_analyze(); when it is
# falsy, process_pdf() logs an error and returns None instead.
model_config.__use_inside_model__ = True
|
|
|
|
|
def process_pdf(file_path):
    """Convert the PDF at ``file_path`` to Markdown through ``UNIPipe``.

    The PDF bytes are read from disk and handed to a ``UNIPipe`` together
    with an initially empty model list and a ``DiskReaderWriter`` rooted at
    ``uploads/images`` (created if missing). The pipe is then driven through
    ``pipe_classify``, optionally ``pipe_analyze``, ``pipe_parse`` and
    ``pipe_mk_markdown``.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The value returned by ``pipe.pipe_mk_markdown``, or ``None`` when
        the model list is empty and ``model_config.__use_inside_model__`` is
        falsy, or when any exception is raised along the way (the exception
        is logged, not propagated).
    """
    try:
        # Context manager guarantees the handle is closed even if read() fails.
        with open(file_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # The same list object is shared with the pipe through
        # jso_useful_key, so it is re-checked after pipe_classify().
        model_json = []
        jso_useful_key = {"_pdf_type": "", "model_list": model_json}

        local_image_dir = os.path.join('uploads', 'images')
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(local_image_dir, exist_ok=True)
        # Only the last path component ('images') is passed to
        # pipe_mk_markdown below.
        image_dir = os.path.basename(local_image_dir)
        image_writer = DiskReaderWriter(local_image_dir)

        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        pipe.pipe_classify()

        if not model_json:
            if not model_config.__use_inside_model__:
                logger.error("need model list input")
                return None
            pipe.pipe_analyze()

        pipe.pipe_parse()
        return pipe.pipe_mk_markdown(image_dir, drop_mode="none")
    except Exception as e:
        # Top-level boundary for the UI callback: log with traceback and
        # report failure as None rather than propagating.
        logger.exception(e)
        return None
|
|
|
|
|
def extract_markdown_from_pdf(pdf):
    """Save an uploaded PDF under ``uploads/`` and convert it to Markdown.

    Args:
        pdf: Uploaded file object exposing ``.name`` and ``.read()``, or
            ``None`` when nothing was uploaded.

    Returns:
        Whatever ``process_pdf`` returns for the saved copy, or ``None``
        when ``pdf`` is ``None``.
    """
    # Nothing uploaded: return None instead of failing on attribute access.
    if pdf is None:
        return None

    upload_dir = 'uploads'
    os.makedirs(upload_dir, exist_ok=True)

    # Keep only the final path component. os.path.join() discards
    # 'uploads' when the second argument is absolute, which would make the
    # destination the uploaded file itself (and allows writing outside
    # the upload directory).
    file_path = os.path.join(upload_dir, os.path.basename(pdf.name))

    # Read before opening the destination so the source can never be
    # truncated ahead of the read.
    pdf_data = pdf.read()
    with open(file_path, 'wb') as f:
        f.write(pdf_data)

    return process_pdf(file_path)
|
|
|
|
|
def main():
    """Build the Gradio UI for PDF-to-Markdown extraction and launch it.

    The page has a heading, a PDF upload widget next to a Markdown output
    area, and a button that runs ``extract_markdown_from_pdf`` on the
    uploaded file and shows the result in the output area.
    """
    with gr.Blocks() as app:
        gr.Markdown("# PDF to Markdown Converter")

        with gr.Row(), gr.Column():
            upload_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            markdown_view = gr.Markdown(label="Extracted Markdown")

        # Wire the button straight to the extraction callback.
        gr.Button("Extract Markdown").click(
            extract_markdown_from_pdf,
            inputs=[upload_input],
            outputs=[markdown_view],
        )

    app.launch(share=True)
|
|
|
|
|
# Start the UI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|