Spaces:
Running
Running
from typing import Optional | |
from llama_index.core.readers.base import BaseReader | |
from llama_index.core.schema import Document | |
from fastapi import UploadFile | |
from typing import List | |
from PyPDF2 import PdfReader | |
from io import BytesIO | |
import fitz # PyMuPDF | |
class Reader(BaseReader):
    """Reader that turns an uploaded PDF into a flat list of ``Document`` objects."""

    async def read_from_uploadfile(self, file: UploadFile) -> List[Document]:
        """Extract the text and embedded images of an uploaded PDF.

        The upload is read fully into memory and parsed with PyMuPDF. For each
        page, in page order, the result contains:

        * one ``Document`` holding the page's stripped text (omitted when the
          page has no text), with metadata ``{"page": <1-based page number>}``;
        * one ``Document`` per entry returned by ``page.get_images(full=True)``,
          whose text is a short label and whose metadata carries ``"page"``,
          ``"image_index"`` (1-based, per page) and ``"image"`` (a ``BytesIO``
          wrapping the extracted image bytes).

        Args:
            file: The uploaded file; its whole content is awaited via
                ``file.read()``.

        Returns:
            The text and image documents described above.

        Raises:
            RuntimeError: If reading or parsing fails for any reason. The
                original exception is attached as ``__cause__``.
        """
        try:
            # Read the whole upload; PyMuPDF parses from an in-memory buffer.
            file_content = await file.read()

            documents: List[Document] = []

            # The context manager guarantees the PyMuPDF document is closed on
            # both the success path and the error path.
            with fitz.open(stream=file_content, filetype="pdf") as pdf_document:
                for page_number, page in enumerate(pdf_document, start=1):
                    text = page.get_text().strip()
                    if text:
                        documents.append(
                            Document(text=text, metadata={"page": page_number})
                        )

                    images = page.get_images(full=True)
                    for image_number, image_info in enumerate(images, start=1):
                        # The first item of each image entry is its xref.
                        xref = image_info[0]
                        base_image = pdf_document.extract_image(xref)
                        image_stream = BytesIO(base_image["image"])

                        # NOTE(review): a BytesIO inside metadata is probably
                        # not JSON-serializable -- confirm downstream consumers
                        # can handle it before persisting these documents.
                        documents.append(
                            Document(
                                text=f"Image {image_number} on page {page_number}",
                                metadata={
                                    "page": page_number,
                                    "image_index": image_number,
                                    "image": image_stream,
                                },
                            )
                        )

            return documents
        except Exception as e:
            # Boundary handler: report the failure, then surface one exception
            # type to callers while keeping the original cause chained.
            print(f"Error reading PDF file: {e}")
            raise RuntimeError(f"Failed to process the uploaded file: {e}") from e