import gradio as gr import subprocess import uuid import os import requests import re def get_pdf(pdf_link): # Generate a unique filename unique_filename = f"input/downloaded_paper_{uuid.uuid4().hex}.pdf" # Send a GET request to the PDF link response = requests.get(pdf_link) if response.status_code == 200: # Save the PDF content to a local file with open(unique_filename, 'wb') as pdf_file: pdf_file.write(response.content) print("PDF downloaded successfully.") else: print("Failed to download the PDF.") return unique_filename #.split('/')[-1][:-4] def nougat_ocr(file_name): #unique_filename = f"/content/output/downloaded_paper_{uuid.uuid4().hex}.pdf" # Command to run cli_command = [ 'nougat', #'--out', unique_filename, '--out', 'output', 'pdf', f'{file_name}', '--checkpoint', 'nougat', '--markdown' ] # Run the command and capture its output #completed_process = subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return #unique_filename def predict(pdf_file, pdf_link): if pdf_file is None: if pdf_link == '': print("No file is uploaded and No link is provided") return "No data provided. Upload a pdf file or provide a pdf link and try again!" else: print(f'pdf_link is - {pdf_link}') file_name = get_pdf(pdf_link) print(f'file_name is - {file_name}') else: file_name = pdf_file.name print(file_name) pdf_name = pdf_file.name.split('/')[-1].split('.')[0] print(pdf_name) # Call nougat nougat_ocr(file_name) #print("BACKKKK") # Open the file for reading file_name = file_name.split('/')[-1][:-4] with open(f'output/{file_name}.mmd', 'r') as file: content = file.read() # switch math delimiters content = content.replace(r'\(', '$').replace(r'\)', '$').replace(r'\[', '$$').replace(r'\]', '$$') return content def nougat_ocr1(file_name): print('******* inside nougat_ocr *******') # CLI Command to run cli_command = [ 'nougat', '--out', 'output', 'pdf', f'{file_name}', '--checkpoint', 'nougat', '--markdown' ] # Run the command and get .mmd file in an output folder subprocess.run(cli_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return def predict1(pdf_file): print('******* inside predict *******') print(f"temporary file - {pdf_file.name}") pdf_name = pdf_file.name.split('/')[-1].split('.')[0] print(f"pdf file name - {pdf_name}") #! Get prediction for a PDF using nougat nougat_ocr(pdf_file.name) print("BAACCKKK") # Open the multimarkdown (.mmd) file for reading with open(f'output/{pdf_name}.mmd', 'r') as file: content = file.read() return content def process_example(pdf_file,pdf_link): ocr_content = predict(pdf_file,pdf_link) return gr.update(value=ocr_content) css = """ #mkd { height: 500px; overflow: auto; border: 1px solid #ccc; } """ with gr.Blocks(css=css) as demo: gr.HTML("