import os import json import fitz import streamlit as st from PIL import Image import io import pandas as pd import pickle import zipfile import tempfile def extract_text_images( pdf_path: str, output_folder: str, minimum_font_size: int, mode: str = 'headerwise', header_font_sizes: list[float] = None, tolerance: float = 0.01, extraction_type: str = 'both', headers_are_capital: bool = False ) -> str: if not os.path.exists(output_folder): os.makedirs(output_folder) extraction_data = [] current_header = None current_header_content = [] def add_current_header_content() -> None: nonlocal current_header, current_header_content if current_header: extraction_data.append({ 'header': current_header, 'content': current_header_content }) current_header_content = [] current_header = None def is_header_font_size(font_size: float) -> bool: return any( abs(font_size - header_font_size) <= tolerance for header_font_size in header_font_sizes ) def is_bold(font: str) -> bool: return 'bold' in font.lower() pdf_document = fitz.open(pdf_path) for page_number in range(pdf_document.page_count): page = pdf_document.load_page(page_number) elements = [] if extraction_type in ('text', 'both'): text_blocks = page.get_text("dict")["blocks"] lines = {} for block in text_blocks: if block["type"] == 0: # Text block for line in block["lines"]: for span in line["spans"]: font_size = span["size"] top = span["bbox"][1] font = span["font"] if font_size < minimum_font_size: continue if top not in lines: lines[top] = [] lines[top].append(span) for top in sorted(lines.keys()): line = lines[top] line_text = " ".join([span['text'] for span in line]) line_font_size = line[0]['size'] font = line[0]['font'] if headers_are_capital: line_text_is_header = line_text.isupper() else: line_text_is_header = True elements.append({ 'type': 'text', 'font_size': line_font_size, 'page': page_number + 1, 'content': line_text, 'x0': line[0]['bbox'][0], 'top': top, 'font': font, 'is_header': line_text_is_header }) if extraction_type in ('images', 'both'): image_list = page.get_images(full=True) for img_index, img in enumerate(image_list): xref = img[0] base_image = pdf_document.extract_image(xref) image_bytes = base_image["image"] image_filename = os.path.join( output_folder, f"page_{page_number + 1}_img_{img_index + 1}.png" ) with open(image_filename, "wb") as img_file: img_file.write(image_bytes) img_rect = page.get_image_bbox(img) elements.append({ 'type': 'image', 'page': page_number + 1, 'path': image_filename, 'x0': img_rect.x0, 'top': img_rect.y0 }) elements.sort(key=lambda e: (e['top'], e['x0'])) if mode == 'headerwise': for element in elements: if element['type'] == 'text' and element['is_header'] and is_header_font_size(element['font_size']) and is_bold(element['font']): add_current_header_content() current_header = element['content'] elif element['type'] == 'text': if current_header_content and current_header_content[-1]['type'] == 'text': current_header_content[-1]['content'] += " " + element['content'] else: current_header_content.append({ 'type': 'text', 'content': element['content'] }) elif element['type'] == 'image': current_header_content.append({ 'type': 'image', 'path': element['path'] }) if mode == 'headerwise': add_current_header_content() pdf_document.close() json_output_path = os.path.join(output_folder, 'extraction_data.json') with open(json_output_path, 'w', encoding='utf-8') as json_file: json.dump(extraction_data, json_file, ensure_ascii=False, indent=4) # Save to XLSX df = pd.json_normalize(extraction_data, sep='_') xlsx_output_path = os.path.join(output_folder, 'extraction_data.xlsx') df.to_excel(xlsx_output_path, index=False) # Save to Pickle pickle_output_path = os.path.join(output_folder, 'extraction_data.pkl') with open(pickle_output_path, 'wb') as pickle_file: pickle.dump(extraction_data, pickle_file) # Create ZIP file zip_output_path = os.path.join(output_folder, 'extraction_data.zip') with zipfile.ZipFile(zip_output_path, 'w') as zipf: zipf.write(json_output_path, os.path.basename(json_output_path)) zipf.write(xlsx_output_path, os.path.basename(xlsx_output_path)) zipf.write(pickle_output_path, os.path.basename(pickle_output_path)) if extraction_type in ('images', 'both'): for root, _, files in os.walk(output_folder): for file in files: if file.endswith('.png'): zipf.write(os.path.join(root, file), file) return json_output_path, xlsx_output_path, pickle_output_path, zip_output_path def render_pdf_page_as_image(pdf_path: str, page_number: int, zoom: float = 2.0) -> io.BytesIO: # Render PDF page as an image pdf_document = fitz.open(pdf_path) page = pdf_document.load_page(page_number - 1) # Page number is zero-indexed in fitz pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom)) img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) img_bytes = io.BytesIO() img.save(img_bytes, format="PNG") img_bytes.seek(0) pdf_document.close() return img_bytes # Streamlit UI st.markdown("