"""Streamlit app that extracts tables from an uploaded PDF with Azure Form Recognizer.

Each page of the uploaded PDF is sent separately to the ``prebuilt-layout``
model; every detected table is shown in the page and offered as a CSV download.
"""

import hashlib
import os
from io import BytesIO

import pandas as pd
import streamlit as st
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from PyPDF2 import PdfReader, PdfWriter

# Both variables are mandatory: a missing one raises KeyError at start-up.
YOUR_ENDPOINT = os.environ["YOUR_ENDPOINT"]
YOUR_KEY = os.environ["YOUR_KEY"]

st.set_page_config(
    page_title="PDF Table Extractor",
    layout="centered",
    initial_sidebar_state="auto",
)

document_analysis_client = DocumentAnalysisClient(
    endpoint=YOUR_ENDPOINT,
    credential=AzureKeyCredential(YOUR_KEY),
)

# Session-state slot holding ``(sha256_of_pdf, extracted_tables)`` for the most
# recent upload, so that Streamlit reruns (e.g. clicking a download button) do
# not send the same pages to the analysis service again.
_CACHE_KEY = "_extracted_tables"


def table2pandas(table):
    """Convert an analyzed table into a rectangular pandas DataFrame.

    Args:
        table: Object exposing ``cells``; every cell must provide
            ``row_index``, ``column_index`` and ``content``.

    Returns:
        A DataFrame with one row per table row. Positions for which no cell
        was reported hold an empty string; an empty ``cells`` sequence yields
        an empty DataFrame.
    """
    grid = []
    for cell in table.cells:
        # Grow the grid on demand; cells may arrive in any order.
        while len(grid) <= cell.row_index:
            grid.append([])
        row = grid[cell.row_index]
        missing = cell.column_index + 1 - len(row)
        if missing > 0:
            row.extend([""] * missing)
        row[cell.column_index] = cell.content

    # Pad short rows so trailing gaps are "" like interior gaps, instead of
    # the None values pandas would insert for ragged input.
    width = max((len(row) for row in grid), default=0)
    for row in grid:
        row.extend([""] * (width - len(row)))
    return pd.DataFrame(grid)


def split_pdf_to_pages(filepath):
    """Split a PDF into single-page PDF documents.

    Args:
        filepath: Path to a PDF file, or a binary file-like object containing
            one (both are accepted by ``PdfReader``).

    Returns:
        A list of ``bytes`` objects, one complete single-page PDF per page of
        the input, in page order.
    """
    pages = []
    for page in PdfReader(filepath).pages:
        writer = PdfWriter()
        writer.add_page(page)
        buffer = BytesIO()
        writer.write(buffer)
        pages.append(buffer.getvalue())
    return pages


def _extract_tables(pdf_bytes):
    """Analyze every page of a PDF and collect the tables that were found.

    Args:
        pdf_bytes: Raw content of the PDF.

    Returns:
        A list of ``(page_number, table_number, dataframe)`` tuples; both
        numbers are 1-based and ``table_number`` restarts on each page.
    """
    extracted = []
    page_blobs = split_pdf_to_pages(BytesIO(pdf_bytes))
    for page_num, page_bytes in enumerate(page_blobs, start=1):
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=page_bytes
        )
        result = poller.result()
        # Tolerate a result without a ``tables`` attribute or with no tables.
        page_tables = getattr(result, "tables", None) or []
        for table_num, table in enumerate(page_tables, start=1):
            extracted.append((page_num, table_num, table2pandas(table)))
    return extracted


def main():
    """Render the app: upload a PDF, show its tables and offer CSV downloads."""
    st.title("PDF Table Extractor")

    uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_file is None:
        return

    # The upload is processed entirely in memory. Writing it to disk under the
    # client-supplied name would leave files behind and let a crafted name
    # escape the target directory.
    pdf_bytes = uploaded_file.getvalue()
    st.text("Uploaded successfully. Extracting tables...")

    digest = hashlib.sha256(pdf_bytes).hexdigest()
    cached = st.session_state.get(_CACHE_KEY)
    if cached is None or cached[0] != digest:
        cached = (digest, _extract_tables(pdf_bytes))
        st.session_state[_CACHE_KEY] = cached
    tables = cached[1]

    if not tables:
        st.warning("No tables were found in the uploaded PDF.")
        return

    # Strip directory parts and the extension regardless of its letter case.
    stem = os.path.splitext(os.path.basename(uploaded_file.name))[0]
    for page_num, table_num, table_df in tables:
        st.write(table_df)
        st.download_button(
            label="Download CSV",
            data=table_df.to_csv(index=False).encode("utf-8"),
            file_name=f"{stem}_page{page_num}_table{table_num}.csv",
            mime="text/csv",
            # Explicit key: every button shares the same label.
            key=f"download_page{page_num}_table{table_num}",
        )

    st.success("Tables extracted and saved successfully!")


if __name__ == "__main__":
    main()