Document_QnA / src /tools /reader_html.py
Quent1Fvr's picture
v2.
e2e8616
raw
history blame
4.57 kB
from pyquery import PyQuery as pq
from src.model.paragraph import Paragraph
from bs4 import BeautifulSoup
from src.tools.readers_pdf import Reader_illumio
from src.tools.table_converter import table_converter
class Reader_HTML:
    """Read an HTML file and expose its content as a list of ``Paragraph`` objects.

    On construction the file at ``path`` is parsed with ``read_html_2`` and the
    result is stored in ``self.paragraphs``. Every paragraph is created with
    ``page_id=1`` and with the HTML tag name of its source element as
    ``font_style``.
    """

    # Elements whose whole subtree is removed before any text is extracted.
    _IGNORED_TAGS = ('style', 'script', 'footer', 'header', 'nav', 'aside', 'form')
    # Font styles that are folded into a single "list" paragraph.
    _LIST_TAGS = ("ul", "ol", "li")

    def __init__(self, path):
        """Store ``path`` and parse it right away into ``self.paragraphs``."""
        self.path = path
        self.paragraphs = self.read_html_2(path)

    def read_html(self, path):
        """Parse ``path`` with PyQuery and return one Paragraph per non-empty leaf element.

        Variant without BeautifulSoup. The original author noted that it does
        not work well; the constructor uses ``read_html_2`` instead.
        """
        with open(path, 'r') as html_file:
            doc = pq(html_file.read())

        # Remove script and style elements
        doc('script').remove()
        doc('style').remove()

        paragraphs = []
        for index, elem in enumerate(doc('*')):
            node = pq(elem)
            # Only leaves (elements that do not contain other elements) are kept.
            if node.find('*'):
                continue
            text = node.text().strip()
            if text:
                paragraphs.append(Paragraph(text=text, font_style=elem.tag, id_=index, page_id=1))
        return paragraphs

    def read_html_2(self, path):
        """Parse ``path`` with BeautifulSoup and return the processed paragraphs.

        Steps:
          1. drop every element listed in ``_IGNORED_TAGS``;
          2. build one Paragraph per leaf element that has non-empty text;
          3. merge them with ``concatenate_paragraphs_with_same_font_style``;
          4. replace each paragraph by the result of its ``rearrange_paragraph()``.
        """
        # Context manager so the file handle is always closed.
        with open(path, "r") as html_file:
            soup = BeautifulSoup(html_file.read(), 'html.parser')

        for tag in soup(list(self._IGNORED_TAGS)):
            tag.decompose()

        # html.parser does not synthesize a <body>; fall back to the whole
        # document for fragments instead of failing on ``None.descendants``.
        root = soup.body if soup.body is not None else soup

        # Get all elements that do not contain other elements
        leaf_elements = [
            elem for elem in root.descendants
            if elem.name is not None and not elem.find_all()
        ]

        paragraphs = []
        for index, elem in enumerate(leaf_elements):
            text = elem.get_text(strip=True, separator='\n')
            if text:
                paragraphs.append(Paragraph(text=text, font_style=elem.name, id_=index, page_id=1))

        paragraphs = self.concatenate_paragraphs_with_same_font_style(paragraphs)
        return [p.rearrange_paragraph() for p in paragraphs]

    def concatenate_paragraphs_with_same_font_style(self, paragraphs: "list[Paragraph]"):
        """Collapse runs of related paragraphs, in place, and return the list.

        - a run starting with ``th`` is replaced by one "table" paragraph;
        - a run starting with ``li`` is replaced by one "list" paragraph;
        - otherwise, neighbours sharing the same ``font_style`` are joined with
          a newline into the first of them.
        """
        i = 0
        # Iterate up to the very last paragraph so that a trailing lone
        # "th"/"li" is converted as well.
        while i < len(paragraphs):
            style = paragraphs[i].font_style
            if style == "th":
                paragraphs = self.create_table(paragraphs, i)
                i += 1
            elif style == "li":
                paragraphs, i = self.create_list(paragraphs, i)
                i += 1
            elif i + 1 < len(paragraphs) and style == paragraphs[i + 1].font_style:
                # Stay on i: the merged paragraph may absorb further neighbours.
                paragraphs[i].text += "\n" + paragraphs[i + 1].text
                paragraphs.pop(i + 1)
            else:
                i += 1
        return paragraphs

    def create_table(self, paragraphs, i: int):
        """Replace the ``th``/``td`` run starting at index ``i`` by one "table" paragraph.

        The consecutive ``th`` paragraphs form the header row; the ``td``
        paragraphs that follow are split into rows of the header's width (the
        last row may be shorter). The rows are rendered by ``table_converter``
        and the resulting paragraph is inserted at index ``i``.

        Returns the (mutated) ``paragraphs`` list.
        """
        titles = []
        while i < len(paragraphs) and paragraphs[i].font_style == "th":
            titles.append(paragraphs.pop(i).text)

        cells = []
        while i < len(paragraphs) and paragraphs[i].font_style == "td":
            cells.append(paragraphs.pop(i).text)

        table = [titles] + self._split_rows(cells, len(titles))
        paragraphs.insert(i, Paragraph(table_converter(table), font_style="table", id_=i, page_id=1))
        return paragraphs

    @staticmethod
    def _split_rows(cells, width):
        """Split the flat ``cells`` list into consecutive rows of ``width`` cells."""
        # Guard against a zero step when no header cell was found.
        width = max(width, 1)
        return [cells[start:start + width] for start in range(0, len(cells), width)]

    def create_list(self, paragraphs, i: int):
        """Replace the list run starting at index ``i`` by one "list" paragraph.

        Consecutive paragraphs whose ``font_style`` is ``li``, ``ul`` or ``ol``
        are removed and their texts become the items of the list. Only leaf
        elements reach this method, so a ``ul``/``ol`` entry is a childless
        container carrying its own text; it is kept as a plain item because
        the nesting structure is not available at this point.

        Returns ``(paragraphs, i)`` where ``i`` is the index of the inserted
        "list" paragraph.
        """
        items = []
        while i < len(paragraphs) and paragraphs[i].font_style in self._LIST_TAGS:
            items.append(paragraphs.pop(i).text)

        list_paragraph = Paragraph(text=self.format_list(items), font_style="list", id_=i, page_id=1)
        paragraphs.insert(i, list_paragraph)
        return paragraphs, i

    def format_list(self, list_content):
        """Render ``list_content`` as numbered lines (``"1. item\\n"``).

        String entries are written as-is; any other entry is treated as a
        nested list and rendered recursively in place of the item text.
        """
        lines = []
        for number, entry in enumerate(list_content, start=1):
            rendered = entry if isinstance(entry, str) else self.format_list(entry)
            lines.append(f"{number}. {rendered}\n")
        return "".join(lines)