Research-PDFs-VQA / pdf_classes.py
sugiv's picture
Adding initial set of files
452c0e2
raw
history blame contribute delete
760 Bytes
# pdf_classes.py
from docarray import BaseDoc
from docarray import DocList
from docarray.typing import ImageTensor, NdArray
from typing import Dict, Optional
class PDFSegment(BaseDoc):
    """One segment extracted from a page of a PDF.

    ``page_number``, ``segment_type``, ``position`` and ``relationships``
    must always be supplied. The payload fields (``content``, ``image``,
    ``embedding``) are optional and default to ``None``.
    """

    page_number: int
    # One of 'text', 'image', 'table', or 'hybrid'.
    segment_type: str
    # Explicit ``None`` defaults keep the optional fields truly optional under
    # both pydantic v1 and v2 (v2 treats a bare ``Optional[X]`` annotation as
    # required-but-nullable), and match how ``PDFPage.embedding`` is declared.
    content: Optional[str] = None
    image: Optional[ImageTensor] = None
    # Keys: x, y, width, height.
    position: Dict[str, int]
    # Keys: 'prev', 'next', 'parent'; each value is an id or None.
    relationships: Dict[str, Optional[str]]
    # 768-element array when present.
    embedding: Optional[NdArray[768]] = None
class PDFPage(BaseDoc):
    """A single PDF page, held as a screenshot image tensor."""

    page_number: int
    screenshot: ImageTensor
    # 768-element array when present; None until one is assigned.
    embedding: Optional[NdArray[768]] = None
class RichPDFDocument(BaseDoc):
    """Top-level record for one PDF file: its path, page count, segments and pages."""

    file_path: str
    num_pages: int
    # All PDFSegment documents belonging to this file.
    segments: DocList[PDFSegment]
    # All PDFPage documents belonging to this file.
    pages: DocList[PDFPage]