import logging
from io import BytesIO
from typing import List, Optional

import fitz  # PyMuPDF
from fastapi import UploadFile
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from PyPDF2 import PdfReader

logger = logging.getLogger(__name__)


class Reader(BaseReader):
    """Reader that turns an uploaded PDF into a list of ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract per-page text and embedded images from an uploaded PDF.

        For every page, in page order, the result contains:

        * one ``Document`` holding the page's stripped text (omitted when the
          page has no text), with metadata ``{"page": <1-based page number>}``;
        * one ``Document`` per image listed by ``page.get_images(full=True)``,
          whose text is a short label and whose metadata carries ``"page"``,
          ``"image_index"`` (1-based, per page) and ``"image"`` (a ``BytesIO``
          wrapping the extracted image bytes).

        Args:
            file: The uploaded file; its full content is read into memory and
                parsed as a PDF.

        Returns:
            The text and image documents described above.

        Raises:
            RuntimeError: If reading or parsing the upload fails for any
                reason. The original exception is attached as ``__cause__``.
        """
        try:
            file_content = await file.read()

            documents: List[Document] = []
            # Open as a context manager so the PyMuPDF document is always
            # closed, including when extraction fails part-way through.
            with fitz.open(stream=file_content, filetype="pdf") as pdf_document:
                for page_number, page in enumerate(pdf_document, start=1):
                    text = page.get_text().strip()
                    if text:
                        documents.append(
                            Document(text=text, metadata={"page": page_number})
                        )

                    images = page.get_images(full=True)
                    for image_number, image_info in enumerate(images, start=1):
                        xref = image_info[0]
                        base_image = pdf_document.extract_image(xref)
                        # The bytes are copied out of the document here, so the
                        # stream stays usable after the document is closed.
                        image_stream = BytesIO(base_image["image"])
                        documents.append(
                            Document(
                                text=f"Image {image_number} on page {page_number}",
                                metadata={
                                    "page": page_number,
                                    "image_index": image_number,
                                    # NOTE(review): a BytesIO is not JSON-serializable;
                                    # confirm downstream consumers of this metadata
                                    # can handle it.
                                    "image": image_stream,
                                },
                            )
                        )
            return documents
        except Exception as e:
            # Boundary handler: log with traceback, then surface one error type
            # to callers while preserving the underlying cause.
            logger.exception("Error reading PDF file")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e