Spaces:
Running
on
L4
Running
on
L4
File size: 6,955 Bytes
0cc1374 6d920fd 66eda5d 21435e3 0cc1374 f78735b 21435e3 0cc1374 66eda5d c36d1d9 1d8778f 0cc1374 3237819 0cc1374 b07522c 0cc1374 21435e3 0cc1374 1d8778f 0cc1374 b07522c 66eda5d b07522c 66eda5d b07522c 0cc1374 21435e3 0cc1374 21435e3 1d8778f 21435e3 0cc1374 21435e3 0cc1374 21435e3 0cc1374 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# Copyright (c) Opendatalab. All rights reserved.
import base64
import os
import time
import zipfile
from pathlib import Path
import re
os.system('pip install -r requirements.txt')
os.system('pip install -U magic_pdf-0.8.0a3-py3-none-any.whl')
os.system('python -m pip install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/')
from huggingface_hub import snapshot_download
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit')
os.system('wget https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json')
os.system('cp magic-pdf.template.json ~/magic-pdf.json')
os.system(f"sed -i 's|/tmp/models|{model_dir}/models|g' /home/user/magic-pdf.json")
os.system("sed -i 's|cpu|cuda|g' /home/user/magic-pdf.json")
os.system('cp -r paddleocr /home/user/.paddleocr')
os.system("pip install gradio-pdf")
from gradio_pdf import PDF
import gradio as gr
from loguru import logger
from magic_pdf.libs.hash_utils import compute_sha256
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.tools.common import do_parse, prepare_env
import spaces
def read_fn(path):
disk_rw = DiskReaderWriter(os.path.dirname(path))
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
@spaces.GPU
def parse_pdf(doc_path, output_dir, end_page_id):
os.makedirs(output_dir, exist_ok=True)
try:
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
pdf_data = read_fn(doc_path)
parse_method = "auto"
local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method)
do_parse(
output_dir,
file_name,
pdf_data,
[],
parse_method,
False,
end_page_id=end_page_id,
)
return local_md_dir, file_name
except Exception as e:
logger.exception(e)
def compress_directory_to_zip(directory_path, output_zip_path):
"""
压缩指定目录到一个 ZIP 文件。
:param directory_path: 要压缩的目录路径
:param output_zip_path: 输出的 ZIP 文件路径
"""
try:
with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
# 遍历目录中的所有文件和子目录
for root, dirs, files in os.walk(directory_path):
for file in files:
# 构建完整的文件路径
file_path = os.path.join(root, file)
# 计算相对路径
arcname = os.path.relpath(file_path, directory_path)
# 添加文件到 ZIP 文件
zipf.write(file_path, arcname)
return 0
except Exception as e:
logger.exception(e)
return -1
def image_to_base64(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def replace_image_with_base64(markdown_text, image_dir_path):
# 匹配Markdown中的图片标签
pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)'
# 替换图片链接
def replace(match):
relative_path = match.group(1)
full_path = os.path.join(image_dir_path, relative_path)
base64_image = image_to_base64(full_path)
return f"![{relative_path}](data:image/jpeg;base64,{base64_image})"
# 应用替换
return re.sub(pattern, replace, markdown_text)
def to_markdown(file_path, end_pages):
# 获取识别的md文件以及压缩包文件路径
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1)
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
if zip_archive_success == 0:
logger.info("压缩成功")
else:
logger.error("压缩失败")
md_path = os.path.join(local_md_dir, file_name + ".md")
with open(md_path, 'r', encoding='utf-8') as f:
txt_content = f.read()
md_content = replace_image_with_base64(txt_content, local_md_dir)
# 返回转换后的PDF路径
new_pdf_path = os.path.join(local_md_dir, file_name + "_layout.pdf")
# return md_content, txt_content, archive_zip_path, show_pdf(new_pdf_path)
return md_content, txt_content, archive_zip_path, new_pdf_path
# def show_pdf(file_path):
# with open(file_path, "rb") as f:
# base64_pdf = base64.b64encode(f.read()).decode('utf-8')
# pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" ' \
# f'width="100%" height="1000" type="application/pdf">'
# return pdf_display
def show_pdf(file):
return file
latex_delimiters = [{"left": "$$", "right": "$$", "display": True},
{"left": '$', "right": '$', "display": False}]
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
def init_model():
from magic_pdf.model.doc_analyze_by_custom_model import ModelSingleton
try:
model_manager = ModelSingleton()
txt_model = model_manager.get_model(False, False)
logger.info(f"txt_model init final")
# ocr_model = model_manager.get_model(True, False)
# logger.info(f"ocr_model init final")
return 0
except Exception as e:
logger.exception(e)
return -1
model_init = init_model()
logger.info(f"model_init: {model_init}")
if __name__ == "__main__":
with gr.Blocks() as demo:
with gr.Row():
with gr.Column(variant='panel', scale=5):
# file = gr.File(label="Please upload pdf", file_types=[".pdf"])
pdf_show = gr.Markdown()
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
with gr.Row() as bu_flow:
change_bu = gr.Button("Convert")
clear_bu = gr.ClearButton([pdf_show], value="Clear")
# pdf_show = gr.HTML(label="PDF preview")
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
with gr.Column(variant='panel', scale=5):
output_file = gr.File(label="convert result", interactive=False)
with gr.Tabs():
with gr.Tab("Markdown rendering"):
md = gr.Markdown(label="Markdown rendering", height=900, show_copy_button=True,
latex_delimiters=latex_delimiters, line_breaks=True)
with gr.Tab("Markdown text"):
md_text = gr.TextArea(lines=45, show_copy_button=True)
# file.upload(fn=show_pdf, inputs=file, outputs=pdf_show)
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages], outputs=[md, md_text, output_file, pdf_show])
clear_bu.add([md, pdf_show, md_text, output_file])
demo.launch()
|