Spaces:
Runtime error
Runtime error
File size: 3,539 Bytes
e2e8616 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import docx
import os
# sys.path.append('path to app')
# import docx
# import os
# import sys
from src.model.paragraph import Paragraph
class WordReader:
    """Read a Word (.docx) document and expose its paragraphs.

    The document at ``path`` is parsed eagerly when the reader is created and
    the resulting ``Paragraph`` objects are stored in ``self.paragraphs``.
    """

    # Style-name prefixes for headings and the label each one maps to.
    # Checked in order with ``str.startswith``, so a name such as
    # "Heading 10" matches the "Heading 1" entry.
    _HEADING_STYLES = (
        ("Heading 1", "title1"),
        ("Heading 2", "title2"),
        ("Heading 3", "title3"),
        ("Heading 4", "title4"),
        ("Heading 5", "title5"),
    )
    # Font names used by the run-level mapping (per the original comment,
    # these mirror the values used by the PDF reader).
    _CODE_FONT = "XFQKGD+Consolas"
    _SYMBOL_FONT = "Wingdings-Regular"

    def __init__(self, path):
        """Store ``path`` and immediately load the document's paragraphs.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the document cannot be read or converted.
        """
        self.path = path
        self.paragraphs = self.get_word_paragraphs()

    def get_word_paragraphs(self):
        """
        Fetches paragraphs from a Word document.

        Returns:
            list: List of Paragraph objects from the document.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If opening or converting the document fails; the
                original exception is attached as ``__cause__``.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")
        try:
            doc = docx.Document(self.path)
            # Convert the python-docx paragraphs to Paragraph objects.
            return self.to_paragraph_objects(doc.paragraphs)
        except Exception as e:
            # Keep the ValueError contract but chain the cause so the real
            # traceback is not lost.
            raise ValueError(
                f"Error reading the .docx file. Original error: {str(e)}"
            ) from e

    def determine_style(self, paragraph):
        """
        Determines the style of the paragraph based on its attributes.

        Heading styles are checked first. Otherwise the runs are inspected in
        order and the first run with an explicit font size that matches a
        rule decides the result. Runs without an explicit size are skipped.

        Returns:
            str: One of "title1".."title5", "code" or "content".
        """
        # Guard against a missing style or style name instead of crashing.
        style_name = getattr(paragraph.style, "name", None) or ""
        for prefix, label in self._HEADING_STYLES:
            if style_name.startswith(prefix):
                return label

        # Not a heading: look at the runs within the paragraph.
        for run in paragraph.runs:
            font = run.font
            size = font.size
            if not size:
                # No explicit size on this run (inherited from the style):
                # nothing to compare against, so move on to the next run.
                continue
            # python-docx exposes the size as a Length; ``.pt`` is its value
            # in points.
            size_in_points = size.pt
            if font.name == self._CODE_FONT:
                return "code"
            if 9 <= size_in_points < 11.5 or font.name == self._SYMBOL_FONT:
                return "content"

        # If none of the above conditions match, default to 'content'.
        return "content"

    def to_paragraph_objects(self, doc_paragraphs):
        """
        Convert docx paragraphs to Paragraph objects for further processing.

        Each Paragraph receives the docx paragraph's text, the label from
        ``determine_style`` and its index in ``doc_paragraphs`` as id. The
        resulting list is then passed through ``rearrange_paragraphs``.
        """
        # page_id is always 1 for simplicity; change as needed.
        paragraph_objects = [
            Paragraph(
                text=paragraph.text,
                font_style=self.determine_style(paragraph),
                id_=idx,
                page_id=1,
            )
            for idx, paragraph in enumerate(doc_paragraphs)
        ]
        return self.rearrange_paragraphs(paragraph_objects)

    def rearrange_paragraphs(self, paragraphs: "list[Paragraph]"):
        """Replace each entry with the result of its ``rearrange_paragraph()``.

        The list is updated in place and also returned.
        NOTE(review): what ``rearrange_paragraph`` does is defined on
        Paragraph, outside this file - confirm there.
        """
        for i, paragraph in enumerate(paragraphs):
            paragraphs[i] = paragraph.rearrange_paragraph()
        return paragraphs

    def display_paragraphs(self):
        """
        Prints the paragraphs from the document to the console.
        """
        for paragraph in self.paragraphs:
            print(paragraph.text)
            print('-' * 40)  # separator for clarity
# if __name__ == '__main__':
# reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx")
# reader.display_paragraphs()
|