Spaces:
Running
Running
get data availability statement as context for QA
Browse files
document_qa/grobid_processors.py
CHANGED
@@ -183,6 +183,7 @@ class GrobidProcessor(BaseProcessor):
|
|
183 |
})
|
184 |
|
185 |
text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
|
|
|
186 |
|
187 |
use_paragraphs = True
|
188 |
if not use_paragraphs:
|
@@ -800,6 +801,20 @@ def get_xml_nodes_body(soup: object, use_paragraphs: bool = True, verbose: bool
|
|
800 |
return nodes
|
801 |
|
802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
|
804 |
children = []
|
805 |
for child in soup.TEI.children:
|
|
|
183 |
})
|
184 |
|
185 |
text_blocks_body = get_xml_nodes_body(soup, verbose=False, use_paragraphs=True)
|
186 |
+
text_blocks_body.extend(get_xml_nodes_back(soup, verbose=False, use_paragraphs=True))
|
187 |
|
188 |
use_paragraphs = True
|
189 |
if not use_paragraphs:
|
|
|
801 |
return nodes
|
802 |
|
803 |
|
804 |
+
def get_xml_nodes_back(soup: object, use_paragraphs: bool = True, verbose: bool = False) -> list:
    """Collect the paragraph or sentence nodes found in the `back` sections.

    Walks the direct children of ``soup.TEI``; for every child named
    ``text`` it looks up all ``back`` descendants and gathers the matching
    nodes beneath each of them, in the order ``find_all`` yields them.

    Args:
        soup: Parsed document exposing a ``TEI`` attribute whose children
            provide ``name`` and ``find_all`` (presumably a BeautifulSoup
            tree of GROBID TEI output -- confirm against the caller).
        use_paragraphs: When true, collect ``p`` nodes; otherwise collect
            ``s`` nodes.
        verbose: When true, print the collected nodes before returning.

    Returns:
        A list of the matching nodes; empty when no ``text`` child or no
        ``back`` section is present.
    """
    wanted_tag = "p" if use_paragraphs else "s"

    collected = []
    for section in soup.TEI.children:
        # Only the 'text' child is searched for back matter.
        if section.name != 'text':
            continue
        for back_section in section.find_all("back"):
            collected.extend(back_section.find_all(wanted_tag))

    if verbose:
        print(str(collected))

    return collected
|
816 |
+
|
817 |
+
|
818 |
def get_xml_nodes_figures(soup: object, verbose: bool = False) -> list:
|
819 |
children = []
|
820 |
for child in soup.TEI.children:
|