# Spaces: Sleeping
import os | |
from bs4 import BeautifulSoup | |
from document_qa.grobid_processors import get_xml_nodes_body, get_xml_nodes_figures, get_xml_nodes_header | |
from tests.resources import TEST_DATA_PATH | |
def test_get_xml_nodes_body_paragraphs():
    """Check the body node count for the paragraph-level TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml`` and expects
    ``get_xml_nodes_body(soup, use_paragraphs=True)`` to yield 70 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")

    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same fixture could decode differently (or fail) across machines.
    # NOTE(review): assumes the fixture is UTF-8 encoded -- confirm.
    with open(fixture_path, 'r', encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    nodes = get_xml_nodes_body(soup, use_paragraphs=True)

    assert len(nodes) == 70
def test_get_xml_nodes_body_sentences():
    """Check the body node count for the sentence-level TEI fixture.

    Parses ``2312.07559.sentences.tei.xml`` and expects
    ``get_xml_nodes_body(soup, use_paragraphs=False)`` to yield 327 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")

    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same fixture could decode differently (or fail) across machines.
    # NOTE(review): assumes the fixture is UTF-8 encoded -- confirm.
    with open(fixture_path, 'r', encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_body(soup, use_paragraphs=False)

    assert len(children) == 327
def test_get_xml_nodes_figures():
    """Check the figure node count for the paragraph-level TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml`` and expects
    ``get_xml_nodes_figures(soup)`` to yield 13 nodes.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")

    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same fixture could decode differently (or fail) across machines.
    # NOTE(review): assumes the fixture is UTF-8 encoded -- confirm.
    with open(fixture_path, 'r', encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_figures(soup)

    assert len(children) == 13
def test_get_xml_nodes_header_paragraphs():
    """Check the total header node count for the paragraph-level TEI fixture.

    Parses ``2312.07559.paragraphs.tei.xml``, calls
    ``get_xml_nodes_header(soup)`` with its default arguments, and expects
    the lengths of all values in the returned mapping to add up to 8.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.paragraphs.tei.xml")

    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same fixture could decode differently (or fail) across machines.
    # NOTE(review): assumes the fixture is UTF-8 encoded -- confirm.
    with open(fixture_path, 'r', encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup)

    # Only the values matter for the total, so skip the keys and avoid
    # materialising an intermediate list.
    total_nodes = sum(len(child) for child in children.values())
    assert total_nodes == 8
def test_get_xml_nodes_header_sentences():
    """Check the total header node count for the sentence-level TEI fixture.

    Parses ``2312.07559.sentences.tei.xml``, calls
    ``get_xml_nodes_header(soup, use_paragraphs=False)``, and expects the
    lengths of all values in the returned mapping to add up to 15.
    """
    fixture_path = os.path.join(TEST_DATA_PATH, "2312.07559.sentences.tei.xml")

    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same fixture could decode differently (or fail) across machines.
    # NOTE(review): assumes the fixture is UTF-8 encoded -- confirm.
    with open(fixture_path, 'r', encoding="utf-8") as fo:
        soup = BeautifulSoup(fo, 'xml')

    children = get_xml_nodes_header(soup, use_paragraphs=False)

    # Only the values matter for the total, so skip the keys and avoid
    # materialising an intermediate list.
    total_nodes = sum(len(child) for child in children.values())
    assert total_nodes == 15