import os

import docx

from src.model.paragraph import Paragraph


class WordReader:
    """Read a .docx file and expose its paragraphs as ``Paragraph`` objects.

    The document is parsed eagerly in ``__init__``; the converted paragraphs
    are stored in ``self.paragraphs``.

    Example::

        reader = WordReader("Illumio_Core_REST_API_Developer_Guide_23.3.docx")
        reader.display_paragraphs()
    """

    # (Word style-name prefix, internal style label), checked in this order.
    _HEADING_STYLES = (
        ("Heading 1", "title1"),
        ("Heading 2", "title2"),
        ("Heading 3", "title3"),
        ("Heading 4", "title4"),
        ("Heading 5", "title5"),
    )

    # Font family that marks a run as source code.
    _CODE_FONT = "Consolas"

    # Font name treated as regular content regardless of its size.
    _CONTENT_FONT = "Wingdings-Regular"

    def __init__(self, path):
        """Store ``path`` and immediately load the document's paragraphs.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
            ValueError: If the document cannot be read or converted.
        """
        self.path = path
        self.paragraphs = self.get_word_paragraphs()

    def get_word_paragraphs(self):
        """
        Fetches paragraphs from a Word document.

        Returns:
            list: List of Paragraph objects from the document.

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
            ValueError: If opening the document or converting its paragraphs
                fails; the original exception is attached as ``__cause__``.
        """
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"The file {self.path} does not exist.")

        try:
            doc = docx.Document(self.path)
            # Convert to Paragraph objects
            return self.to_paragraph_objects(doc.paragraphs)
        except Exception as e:
            # Chain explicitly so the root cause stays visible in tracebacks.
            raise ValueError(
                f"Error reading the .docx file. Original error: {str(e)}"
            ) from e

    @staticmethod
    def _base_font_name(fontname):
        """Return ``fontname`` without a leading ``TAG+`` prefix.

        The original comparison used the literal ``"XFQKGD+Consolas"``, which
        looks like a subset-prefixed name carried over from the PDF reader.
        Stripping everything up to the last ``+`` lets both that form and a
        plain ``"Consolas"`` match. ``None`` or empty names yield ``""``.
        """
        if not fontname:
            return ""
        return fontname.rpartition("+")[2]

    def determine_style(self, paragraph):
        """
        Determines the style of the paragraph based on its attributes.

        Heading styles ("Heading 1" .. "Heading 5") map to "title1" ..
        "title5". Otherwise the runs are inspected in order; only runs with
        an explicit font size are considered. The first such run set in the
        code font yields "code"; the first such run that is 9pt up to (but
        not including) 11.5pt, or set in the content font, yields "content".

        Args:
            paragraph: A paragraph exposing ``style.name`` and ``runs``, where
                each run has ``font.name`` and ``font.size``.

        Returns:
            str: Style of the paragraph; "content" when nothing else matches.
        """
        # Check for heading styles first. A missing style or style name is
        # treated as "not a heading" instead of raising AttributeError.
        style = paragraph.style
        style_name = (style.name if style is not None else None) or ""
        for prefix, label in self._HEADING_STYLES:
            if style_name.startswith(prefix):
                return label

        # If not a heading, check the runs within the paragraph
        for run in paragraph.runs:
            font = run.font
            size = font.size
            if not size:
                # No explicit size on this run; nothing to classify it by.
                continue

            fontname = font.name
            # python-docx lengths are stored in EMU; ``.pt`` gives points.
            size_in_points = size.pt

            # Map based on font name and size as in the PDF reader
            if self._base_font_name(fontname) == self._CODE_FONT:
                return "code"
            if 9 <= size_in_points < 11.5 or fontname == self._CONTENT_FONT:
                return "content"

        # If none of the above conditions match, default to 'content'
        return "content"

    def to_paragraph_objects(self, doc_paragraphs):
        """
        Convert docx paragraphs to Paragraph objects for further processing.

        Each paragraph gets its position in ``doc_paragraphs`` as ``id_`` and
        the style computed by ``determine_style``. The resulting list is then
        passed through ``rearrange_paragraphs``.

        Returns:
            list: The converted (and rearranged) Paragraph objects.
        """
        # Assuming page_id is always 1 for simplicity, change as needed.
        paragraph_objects = [
            Paragraph(
                text=paragraph.text,
                font_style=self.determine_style(paragraph),
                id_=idx,
                page_id=1,
            )
            for idx, paragraph in enumerate(doc_paragraphs)
        ]
        return self.rearrange_paragraphs(paragraph_objects)

    def rearrange_paragraphs(self, paragraphs: "list[Paragraph]"):
        """Replace every item with the result of its ``rearrange_paragraph()``.

        The list is updated in place and the same list object is returned.
        What ``rearrange_paragraph`` does is defined by ``Paragraph``.
        """
        for i, paragraph in enumerate(paragraphs):
            paragraphs[i] = paragraph.rearrange_paragraph()
        return paragraphs

    def display_paragraphs(self):
        """
        Prints the paragraphs from the document to the console.
        """
        for paragraph in self.paragraphs:
            print(paragraph.text)
            print('-' * 40)  # separator for clarity