# Hugging Face Space (status: Running)
import gradio as gr | |
import pdf_to_image | |
import image_to_text | |
from ml_engine.model_functions import is_it_title | |
def process_pdf(pdf):
    """Split the text of an uploaded PDF into sections that each start at a title.

    The PDF is rendered to page images with ``pdf_to_image.pdfToImg2`` and each
    image is converted to text with ``image_to_text.img2string``. The text is
    walked line by line; blank lines are skipped, and every line for which
    ``is_it_title`` returns a truthy value starts a new section (the title line
    itself is the first line of that section). Sections are not tied to physical
    PDF pages: a section continues across page images until the next title.

    Args:
        pdf: The value supplied by the ``gr.File`` input. Either an object
            exposing the file path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        A list of strings, one per section, each made of its non-empty lines
        terminated by ``"\\n"``. Empty when no file was uploaded or no text
        was extracted.
    """
    # Clicking the button without uploading a file passes None; report no pages
    # instead of failing on the attribute access below.
    if pdf is None:
        return []

    # Accept both a temp-file style object (path in `.name`) and a bare path string.
    pdf_path = getattr(pdf, "name", pdf)
    pdf_pages_images = pdf_to_image.pdfToImg2(pdf_path)

    pages = []
    current_lines = []  # lines (newline-terminated) of the section being built
    for img in pdf_pages_images:
        text = image_to_text.img2string(img)
        for line in text.split("\n"):
            if not line:
                continue
            # A title closes the section in progress, if there is one.
            if is_it_title(line) and current_lines:
                pages.append("".join(current_lines))
                current_lines = []
            current_lines.append(line + "\n")

    # Flush the last section; skip it when nothing was collected so an empty
    # document yields [] rather than [""].
    if current_lines:
        pages.append("".join(current_lines))
    return pages
# Build the Gradio Blocks UI: a file upload, a JSON result view, and a button
# that runs `process_pdf` on the uploaded file.
with gr.Blocks() as demo:
    gr.Markdown(value="# PDF to Pages Processor")
    gr.Markdown(value="Upload a PDF and get a list of extracted pages as output.")

    # NOTE: no `file_types` filter is set, so the picker accepts any file type.
    pdf_input = gr.File(label="Upload a PDF")
    output = gr.JSON(label="Extracted Pages")
    submit_button = gr.Button(value="Process PDF")

    # Wire the button: uploaded file -> process_pdf -> JSON view.
    submit_button.click(
        fn=process_pdf,
        inputs=[pdf_input],
        outputs=[output],
    )

# Start the server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()