Spaces:
Sleeping
Sleeping
File size: 760 Bytes
452c0e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# pdf_classes.py
from docarray import BaseDoc
from docarray import DocList
from docarray.typing import ImageTensor, NdArray
from typing import Dict, Optional
class PDFSegment(BaseDoc):
    """One extracted region of a PDF page (text, image, table, or hybrid).

    The three optional payload fields (``content``, ``image``, ``embedding``)
    default to ``None`` so that a segment can be created with only the
    payload it actually has. Without explicit defaults, pydantic v2 treats
    an ``Optional[...]`` annotation as *required but nullable*, forcing
    callers to pass ``None`` for every field they do not use; pydantic v1
    already defaulted such fields to ``None``, so this is backward-compatible.
    """

    # Page this segment was found on (indexing base is not fixed by this
    # schema -- TODO confirm with the producer whether it is 0- or 1-based).
    page_number: int
    # Kind of segment: 'text', 'image', 'table', or 'hybrid'.
    segment_type: str
    # Textual payload, if any.
    content: Optional[str] = None
    # Image payload, if any.
    image: Optional[ImageTensor] = None
    # Bounding box with keys {x, y, width, height}; units are not specified
    # here -- presumably page coordinates, verify against the extractor.
    position: Dict[str, int]
    # Links to other segments: {'prev': id, 'next': id, 'parent': id}.
    # A value of None means there is no such related segment.
    relationships: Dict[str, Optional[str]]
    # Optional 768-element vector for this segment.
    embedding: Optional[NdArray[768]] = None
class PDFPage(BaseDoc):
    """Page-level record: a page number, its image, and an optional vector."""

    # Number identifying the page within its document.
    page_number: int

    # Image tensor for the page (required); presumably a rendering of the
    # whole page -- confirm against the code that populates it.
    screenshot: ImageTensor

    # Optional 768-element vector for the page; absent (None) until set.
    embedding: Optional[NdArray[768]] = None
class RichPDFDocument(BaseDoc):
    """Top-level container tying a PDF file to its pages and segments.

    All four fields are required. Nothing in this schema checks that
    ``num_pages`` agrees with ``len(pages)``; keeping them in sync is the
    responsibility of whoever builds the document.
    """

    # Location of the source PDF.
    file_path: str

    # Page count recorded for the PDF.
    num_pages: int

    # Every PDFSegment belonging to the document.
    segments: DocList[PDFSegment]

    # Every PDFPage belonging to the document.
    pages: DocList[PDFPage]
|