from typing import List

from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

from src.model.paragraph import Paragraph
from src.tools.readers_pdf import Reader_illumio
from src.tools.table_converter import table_converter


class Reader_HTML:
    """Read an HTML file and expose its content as ``Paragraph`` objects.

    Attributes:
        path: Path of the HTML file given to the constructor.
        paragraphs: Result of ``read_html_2(path)``, computed on construction.
    """

    def __init__(self, path):
        """Store ``path`` and parse the file immediately with ``read_html_2``."""
        self.path = path
        self.paragraphs = self.read_html_2(path)

    def read_html(self, path):
        """Parse ``path`` with PyQuery and return one paragraph per leaf element.

        Alternative to ``read_html_2`` that does not use BeautifulSoup. The
        original author noted that it "doesn't work fine"; the constructor
        does not call it.

        Args:
            path: Path of the HTML file to read.

        Returns:
            A list of ``Paragraph`` objects, one for every element that has no
            child elements and whose stripped text is non-empty. The element's
            tag is stored as ``font_style`` and its position in the document
            enumeration as ``id_``.
        """
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())

        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()

        paragraphs = []
        for index, elem in enumerate(doc('*')):
            # Only leaves (elements that contain no other element) carry text
            # that is not already reported by one of their descendants.
            if not pq(elem).find('*'):
                text = pq(elem).text().strip()
                if text:
                    paragraphs.append(
                        Paragraph(text=text, font_style=elem.tag, id_=index, page_id=1)
                    )
        return paragraphs

    def read_html_2(self, path):
        """Parse ``path`` with BeautifulSoup and return the processed paragraphs.

        Steps:
            1. Drop ``style``, ``script``, ``footer``, ``header``, ``nav``,
               ``aside`` and ``form`` elements.
            2. Turn every leaf tag with non-empty text into a ``Paragraph``
               whose ``font_style`` is the tag name.
            3. Merge / convert them with
               ``concatenate_paragraphs_with_same_font_style``.
            4. Replace each paragraph by the result of its
               ``rearrange_paragraph()`` method.

        Args:
            path: Path of the HTML file to read.

        Returns:
            The list produced by step 4.
        """
        # The context manager closes the file even if parsing fails.
        with open(path, "r") as html_file:
            soup = BeautifulSoup(html_file.read(), 'html.parser')

        for tag in soup(['style', 'script', 'footer', 'header', 'nav', 'aside', 'form']):
            tag.decompose()

        # 'html.parser' does not synthesise a <body>; fall back to the whole
        # document so that HTML fragments do not raise AttributeError.
        root = soup.body if soup.body is not None else soup

        # Get all elements that do not contain other elements
        # (strings and comments have ``name is None`` and are skipped).
        leaf_elements = [
            elem for elem in root.descendants
            if elem.name is not None and not elem.find_all()
        ]

        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                paragraphs.append(
                    Paragraph(text=text, font_style=elem.name, id_=index, page_id=1)
                )

        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        return [p.rearrange_paragraph() for p in paragraphs]

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: List[Paragraph]):
        """Collapse runs of related paragraphs, mutating ``paragraphs`` in place.

        * A paragraph with ``font_style == "th"`` starts a table
          (see ``create_table``).
        * A paragraph with ``font_style == "li"`` starts a list
          (see ``create_list``).
        * Otherwise, a paragraph followed by one with the same ``font_style``
          absorbs the follower's text, joined with a newline.

        Every index is examined, including the last one, so a table header or
        list item that is the final paragraph is converted as well.

        Args:
            paragraphs: Paragraphs in document order.

        Returns:
            The processed list of paragraphs.
        """
        i = 0
        while i < len(paragraphs):
            style = paragraphs[i].font_style
            if style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1
            elif style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1
            elif i + 1 < len(paragraphs) and style == paragraphs[i + 1].font_style:
                # Stay on the same index: the merged paragraph may also match
                # the paragraph that now follows it.
                paragraphs[i].text += "\n" + paragraphs[i + 1].text
                paragraphs.pop(i + 1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        """Replace the ``th``/``td`` run starting at index ``i`` by one table paragraph.

        Consecutive ``th`` paragraphs become the header row. The ``td``
        paragraphs that follow are split into rows as wide as the header; a
        shorter final row is kept, an empty one is not added. The rows are
        passed to ``table_converter`` and the result is stored in a new
        paragraph with ``font_style="table"`` inserted at index ``i``.

        Args:
            paragraphs: Paragraph list; the consumed entries are removed from it.
            i: Index of the first ``th`` paragraph.

        Returns:
            The same ``paragraphs`` list, after modification.
        """
        titles = []
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs.pop(i).text)
        table = [titles]
        width = len(titles)

        row = []
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            row.append(paragraphs.pop(i).text)
            # Close the row as soon as it is as wide as the header so every
            # full row has exactly ``width`` cells.
            if len(row) == width:
                table.append(row)
                row = []
        if row:
            # Incomplete last row (fewer cells than header columns).
            table.append(row)

        paragraphs.insert(
            i, Paragraph(table_converter(table), font_style="table", id_=i, page_id=1)
        )
        return paragraphs

    def create_list(self, paragraphs, i: int):
        """Replace the list run starting at index ``i`` by one list paragraph.

        Consecutive ``li`` paragraphs are removed and their texts collected;
        the collection is rendered with ``format_list`` and stored in a new
        paragraph with ``font_style="list"`` inserted at index ``i``.

        Args:
            paragraphs: Paragraph list; the consumed entries are removed from it.
            i: Index of the first paragraph of the run.

        Returns:
            A tuple ``(paragraphs, i)`` where ``i`` is the index of the
            inserted list paragraph.
        """
        list_content = []
        while i < len(paragraphs) and paragraphs[i].font_style in ("ul", "ol", "li"):
            if paragraphs[i].font_style == "li":
                list_content.append(paragraphs.pop(i).text)
            else:
                # "ul" / "ol" marker.
                # NOTE(review): create_list returns the paragraphs list, so
                # ``sublist`` is that list rather than nested item texts, and
                # format_list does not look able to render it. Behaviour kept
                # as in the original; confirm the intended nested-list handling.
                sublist, i = self.create_list(paragraphs, i + 1)
                list_content.append(sublist)
        list_paragraph = Paragraph(
            text=self.format_list(list_content), font_style="list", id_=i, page_id=1
        )
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        """Render ``list_content`` as a numbered list, one ``"N. item\\n"`` per entry.

        Args:
            list_content: Sequence whose entries are strings or nested
                sequences; nested sequences are rendered recursively and
                embedded in the parent entry.

        Returns:
            The concatenated text; an empty string for an empty sequence.
        """
        lines = []
        for position, item in enumerate(list_content, start=1):
            rendered = item if isinstance(item, str) else self.format_list(item)
            lines.append(f"{position}. {rendered}\n")
        return "".join(lines)