# Spaces: Sleeping Sleeping
# (stray page-header text left over from the hosting page; commented out so the module parses)
# code modified from https://github.com/IBM/document2slides/blob/main/sciduet-build/extract_papers.py
import json
import os
from dataclasses import dataclass
from multiprocessing.pool import Pool
from os import path
from pathlib import Path

import nltk
import pandas as pd
from bs4 import BeautifulSoup
@dataclass
class Person:
    """An author name record parsed from a TEI paper header.

    BUGFIX: the ``@dataclass`` decorator was missing, so the fields below
    were bare class-level annotations with no generated ``__init__`` and
    ``Person(firstname, middlename, surname)`` (as called in
    ``TEIFile.authors``) raised ``TypeError``.
    """
    firstname: str   # first forename, '' when absent
    middlename: str  # middle forename, '' when absent
    surname: str     # family name, '' when absent
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree using the 'xml' parser."""
    with open(tei_file, 'r', encoding='utf-8') as handle:
        return BeautifulSoup(handle, 'xml')
def elem_to_text(elem, default=''):
    """Return ``elem.getText()`` for a truthy element, else the *default* string."""
    return elem.getText() if elem else default
class TEIFile(object):
    """Accessor for one paper: a GROBID TEI XML file plus an optional
    figure-extraction JSON (pdffigures2-style).

    Parses the TEI on construction; title/abstract/text/headers/figures are
    computed lazily and cached on first access.
    """

    def __init__(self, xml_pth: str = None, fig_json_pth: str = ''):
        """Store absolute paths and parse the TEI XML.

        :param xml_pth: path to the GROBID ``*.tei.xml`` file (required in
            practice — ``None`` would fail in ``os.path.abspath``).
        :param fig_json_pth: path to the figure JSON; may point to a
            non-existent file, in which case ``figures()`` returns ``[]``.
        """
        self.xml_pth = os.path.abspath(xml_pth)
        self.fig_json_pth = os.path.abspath(fig_json_pth)
        self.soup = read_tei(self.xml_pth)
        # Lazy caches filled by the accessors below.
        self._text = None
        self._title = ''
        self._abstract = ''
        self._headers = None
        self._figures = None

    def doi(self):
        """Return the paper DOI, or '' when the TEI carries no DOI <idno>."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    def title(self):
        """Return the paper title (first <title> in the tree), cached."""
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    def abstract(self):
        """Return the abstract as one whitespace-normalized string, cached."""
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    def authors(self):
        """Return a list of Person records from the <analytic> header.

        Authors without any persName element are skipped.
        """
        authors_in_header = self.soup.analytic.find_all('author')
        result = []
        for author in authors_in_header:
            persname = author.persName
            if not persname:
                # bs4 lowercases unknown tags in some parsers; try both.
                persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result

    def text(self):
        """Return the paper body as sentence records ``[{'id': i, 'string': s}, ...]``.

        Side effect: populates ``self._headers`` with one dict per numbered
        section — ``{'section': name, 'n': number, 'start': first-sentence-id,
        'end': last-sentence-id}``. A <head> without an 'n' attribute is
        treated as a continuation of the preceding numbered section.
        """
        if not self._text:
            self._headers = []
            headerlist = self.soup.body.find_all("head")
            sections = []
            for head in headerlist:
                if head.parent.name == 'div':
                    txt = head.parent.get_text(separator=' ', strip=True)
                    # the 'n'-attribute check is only valid for arabic numerals
                    if head.get("n"):
                        sections.append([head.text, head.get('n'), txt])
                    else:
                        if len(sections) == 0:
                            # BUGFIX: an un-numbered head arriving before any
                            # numbered section used to fall through to
                            # sections[-1] and raise IndexError; skip it.
                            print("Grobid processing error.")
                            continue
                        sections[-1][2] += txt
            # Assign each section its sentence-id range within the full text.
            start = 0
            for sec in sections:
                sent = nltk.tokenize.sent_tokenize(sec[2])
                sec_dic = {'section': sec[0], 'n': sec[1], 'start': start, 'end': start + len(sent) - 1}
                self._headers.append(sec_dic)
                start += len(sent)
            plain_text = " ".join([sec[2] for sec in sections])
            self._text = [{'id': i, 'string': s} for i, s in enumerate(nltk.tokenize.sent_tokenize(plain_text))]
        return self._text

    def headers(self):
        """Return the section-header index; computed as a side effect of text()."""
        if not self._headers:
            self.text()
        return self._headers

    def figures(self):
        """Return figure records from the companion JSON, cached.

        Each record is ``{'filename', 'caption', 'page', 'bbox'}``; returns
        ``[]`` when the JSON file does not exist.
        """
        if not self._figures:
            self._figures = []
            fn = self.fig_json_pth
            if not path.isfile(fn):
                return []
            with open(fn) as f:
                data = json.load(f)
            for i in data:
                elem = {'filename': i['renderURL'], 'caption': i['caption'], 'page': i['page'], 'bbox': i['regionBoundary']}
                self._figures.append(elem)
        return self._figures
def single_entry(uuid: str, xml_pth: str, fig_json_pth: str):
    """Parse one paper and return ``(uuid, title, abstract, text, headers, figures)``.

    BUGFIX: the original returned the bound methods themselves
    (``tei.title, tei.abstract, ...``) rather than their results; the
    accessors are now actually called.
    """
    tei = TEIFile(xml_pth=xml_pth, fig_json_pth=fig_json_pth)
    return uuid, tei.title(), tei.abstract(), tei.text(), tei.headers(), tei.figures()