"""Gradio app that converts an uploaded PDF into a one-column Excel sheet."""

import base64  # NOTE: unused in this file; kept from the original import list
import os
import tempfile

import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
from transformers import pipeline  # NOTE: unused after the fix below; kept from the original import list


def _open_pdf(uploaded_file):
    """Open *uploaded_file* as a PyMuPDF document.

    The upload may arrive in different shapes depending on how the Gradio
    ``File`` component is configured (path string, temp-file wrapper exposing
    ``.name``, or raw bytes), so each shape is handled explicitly.
    """
    if isinstance(uploaded_file, (bytes, bytearray)):
        return fitz.open(stream=bytes(uploaded_file), filetype="pdf")
    if isinstance(uploaded_file, (str, os.PathLike)):
        return fitz.open(os.fspath(uploaded_file))

    # Temp-file wrappers expose the on-disk location through ``.name``.
    path = getattr(uploaded_file, "name", None)
    if isinstance(path, str) and path:
        return fitz.open(path)

    # Last resort: any other file-like object that can hand us its bytes.
    return fitz.open(stream=uploaded_file.read(), filetype="pdf")


def pdf_to_dataframe(uploaded_file):
    """Extract the text of a PDF into a DataFrame with one row per line.

    The previous implementation fed the raw PDF bytes to a T5
    ``text2text-generation`` pipeline, which expects text prompts and cannot
    parse PDF data. Text is now read directly with PyMuPDF.

    Args:
        uploaded_file: Path, file-like object (with ``.name`` or ``.read()``),
            raw PDF bytes, or ``None``.

    Returns:
        A DataFrame with a single ``"Text"`` column. It is empty when
        ``uploaded_file`` is ``None`` (previously this path crashed with an
        unbound local variable).
    """
    if uploaded_file is None:
        return pd.DataFrame(columns=["Text"])

    # The context manager closes the document even if extraction fails.
    with _open_pdf(uploaded_file) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # Join pages first so every page boundary is also a line boundary.
    lines = "\n".join(page_texts).split("\n")
    return pd.DataFrame({"Text": lines})


def dataframe_to_excel(df, excel_path):
    """Write *df* to the Excel file at *excel_path* without the index column."""
    df.to_excel(excel_path, index=False)


def main():
    """Build the Gradio interface and launch it."""

    def pdf_to_excel_function(pdf_file):
        """Convert the uploaded PDF and return the path of the Excel file."""
        if pdf_file is None:
            # Surface a readable message in the UI instead of an internal error.
            raise gr.Error("Please upload a PDF file.")

        df = pdf_to_dataframe(pdf_file)

        # A fresh directory per request keeps concurrent conversions from
        # overwriting each other while preserving the download file name.
        excel_path = os.path.join(tempfile.mkdtemp(), "output.xlsx")
        dataframe_to_excel(df, excel_path)
        return excel_path

    iface = gr.Interface(
        fn=pdf_to_excel_function,
        inputs=gr.File(label="Upload PDF File"),
        outputs=gr.File(label="Download Excel File"),
        title="PDF to Excel Converter",
        description="Convert a PDF file to an Excel file.",
    )
    iface.launch()


if __name__ == "__main__":
    main()