Spaces:
Sleeping
Sleeping
# pdf_classes.py
from docarray import BaseDoc | |
from docarray import DocList | |
from docarray.typing import ImageTensor, NdArray | |
from typing import Dict, Optional | |
class PDFSegment(BaseDoc):
    """One extracted region of a PDF page.

    A segment carries the page it came from, a type tag, its bounding
    box, links to neighbouring segments, and optional payloads (text,
    image, embedding).

    The optional payload fields default to ``None`` explicitly. Without
    the explicit default, an ``Optional[...]`` annotation is treated as a
    *required* field by pydantic v2, which would force callers to pass
    ``None`` by hand and would be inconsistent with ``PDFPage.embedding``.
    """

    # Page the segment belongs to.
    # NOTE(review): 0- vs 1-based numbering is not visible here; confirm with the producer.
    page_number: int

    # Kind of content held by the segment.
    segment_type: str  # 'text', 'image', 'table', or 'hybrid'

    # Textual payload, when the segment has one.
    content: Optional[str] = None

    # Image payload, when the segment has one.
    image: Optional[ImageTensor] = None

    # Bounding box of the segment.
    position: Dict[str, int]  # {x, y, width, height}

    # Links to related segments; a value of None means "no such link".
    relationships: Dict[str, Optional[str]]  # {'prev': id, 'next': id, 'parent': id}

    # Optional vector for the segment, constrained to 768 by the annotation.
    embedding: Optional[NdArray[768]] = None
class PDFPage(BaseDoc):
    """Page-level record of a PDF: its number, an image, and an optional vector."""

    # Number identifying the page within its document.
    # NOTE(review): 0- vs 1-based numbering is not visible here; confirm with the producer.
    page_number: int

    # Image tensor for the page (presumably a rendering of the whole page; confirm with the producer).
    screenshot: ImageTensor

    # Optional vector for the page, constrained to 768 by the annotation;
    # left as None until one is assigned.
    embedding: Optional[NdArray[768]] = None
class RichPDFDocument(BaseDoc):
    """Top-level container for a PDF: source path, page count, segments, and pages."""

    # Path of the PDF file this document was built from.
    file_path: str

    # Number of pages in the PDF.
    # NOTE(review): nothing here enforces num_pages == len(pages); confirm with the producer.
    num_pages: int

    # All segments of the document, as a docarray DocList of PDFSegment.
    segments: DocList[PDFSegment]

    # Page-level records, as a docarray DocList of PDFPage.
    pages: DocList[PDFPage]