# pdf_classes.py
"""DocArray schemas describing a PDF as pages plus extracted segments."""

from typing import Dict, Optional

from docarray import BaseDoc, DocList
from docarray.typing import ImageTensor, NdArray


class PDFSegment(BaseDoc):
    """One extracted region of a PDF page.

    The optional payload fields (``content``, ``image``, ``embedding``)
    default to ``None`` so a segment can be built with only the fields
    relevant to its ``segment_type``; this mirrors ``PDFPage.embedding``.
    """

    # Page the segment belongs to.
    # NOTE(review): 0- vs 1-based numbering is not fixed here -- confirm with producers.
    page_number: int
    # One of 'text', 'image', 'table', or 'hybrid' (not enforced by the schema).
    segment_type: str
    # Textual payload of the segment, if any.
    content: Optional[str] = None
    # Image payload of the segment, if any.
    image: Optional[ImageTensor] = None
    # Bounding box with keys {x, y, width, height}.
    # NOTE(review): units and coordinate origin are not specified here -- confirm.
    position: Dict[str, int]
    # Links to related items: {'prev': id, 'next': id, 'parent': id}; a value
    # may be None. Presumably the ids are those of other segments -- verify.
    relationships: Dict[str, Optional[str]]
    # Optional vector of length 768.
    embedding: Optional[NdArray[768]] = None


class PDFPage(BaseDoc):
    """A single PDF page represented by its screenshot and optional embedding."""

    # Page this record describes.
    page_number: int
    # Image tensor holding the page screenshot.
    screenshot: ImageTensor
    # Optional vector of length 768.
    embedding: Optional[NdArray[768]] = None


class RichPDFDocument(BaseDoc):
    """A PDF file together with its pages and extracted segments."""

    # Path of the PDF file this document was built from.
    file_path: str
    # Number of pages; presumably equal to len(pages) -- not enforced here.
    num_pages: int
    # Extracted segments of the document.
    segments: DocList[PDFSegment]
    # Page records of the document.
    pages: DocList[PDFPage]