"""Preprocess a PDF into TEI XML, extracted figures and merged JSON under ``temp_dir``."""
import json
import os
import shutil
import sys
from subprocess import call
from typing import Optional

from grobid_client.grobid_client import GrobidClient

# add module: the project root must be on sys.path before `core` is imported
module_path = os.path.abspath(os.path.join('/project'))
if module_path not in sys.path:
    sys.path.append(module_path)

from core.tei import single_entry  # noqa: E402  (depends on the sys.path tweak above)

temp_dir = '/project/temp'
pdffigures2_home = '/opt/pdffigures2'
grobid_home = '/opt/grobid'
grobid_python_config_pth = '/opt/grobid_client_python/config.json'

# Output locations used by pdffigures2, relative to ``pdffigures2_home``.
_FIGURE_DIR = 'figure'
_FIGURE_IMAGE_DIR = 'figure/image'
_FIGURE_JSON_DIR = 'figure/json'


def remove_temp_directory():
    """Delete ``temp_dir`` and everything in it, if it exists."""
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)


def grobid_clident():
    """Return a ``GrobidClient`` configured from ``grobid_python_config_pth``.

    The misspelled name is kept on purpose so existing callers keep working.
    """
    return GrobidClient(config_path=grobid_python_config_pth)


def _extract_figures(pdf_dir: str, dest_dir: str) -> None:
    """Run the pdffigures2 batch CLI on ``pdf_dir`` and move its output into ``dest_dir``."""
    figure_dir = os.path.join(pdffigures2_home, _FIGURE_DIR)
    for rel_dir in (_FIGURE_DIR, _FIGURE_IMAGE_DIR, _FIGURE_JSON_DIR):
        os.makedirs(os.path.join(pdffigures2_home, rel_dir), exist_ok=True)

    # sbt receives the whole `runMain ...` invocation as ONE argument, so the
    # pieces are joined with single spaces into a single string.
    # NOTE(review): a path containing spaces would probably break this
    # command line - confirm whether that can happen.
    run_main = ' '.join([
        'runMain org.allenai.pdffigures2.FigureExtractorBatchCli',
        '-e',
        '-q',
        os.path.abspath(pdf_dir) + '/',
        '-m',
        './' + _FIGURE_IMAGE_DIR + '/',
        '-d',
        './' + _FIGURE_JSON_DIR + '/',
        '-s',
        './' + _FIGURE_DIR + '/stat.json',
    ])
    # NOTE(review): the exit status of sbt is not checked (same as before);
    # a failed extraction only surfaces later when the figure JSON is read.
    call(['sbt', '-J-Xmx4G', run_main], cwd=pdffigures2_home)

    # ``dest_dir`` already exists, so the directory lands at <dest_dir>/figure.
    shutil.move(figure_dir, dest_dir)


def _section_texts(text, headers):
    """Return one space-joined string per header, built from ``text[start..end]``.

    ``end`` is inclusive and is clamped to the length of ``text``.
    """
    paper_length = len(text)
    return [
        ' '.join(
            text[idx]['string']
            for idx in range(head['start'], min(head['end'] + 1, paper_length))
        )
        for head in headers
    ]


def process_pdf(pdf_pth: str, file_name: Optional[str] = None):
    """Preprocess a PDF: generate XML, extract figures, and store everything in ``temp_dir``.

    Any previous ``temp_dir`` is removed first. The results are written below
    ``<temp_dir>/<name>/``, where ``<name>`` is ``file_name`` without its
    extension:

    * ``pdf/``    - a copy of the input PDF
    * ``xml/``    - output of the GROBID ``processFulltextDocument`` service
    * ``figure/`` - output of pdffigures2 (images, JSON, ``stat.json``)
    * ``json/<name>.json`` - title, abstract, text, headers and figures
    * ``<name>.preprocessed_text.json`` - list with one text string per header

    Args:
        pdf_pth: Path of the PDF to process.
        file_name: File name used to derive ``<name>``. Defaults to the
            basename of ``pdf_pth``.
    """
    if file_name is None:
        file_name = os.path.basename(pdf_pth)

    client = grobid_clident()
    remove_temp_directory()

    # splitext instead of [:-4]: also correct for names without a 4-char suffix.
    name = os.path.splitext(file_name)[0]
    work_dir = os.path.join(temp_dir, name)
    temp_pdf_dir = os.path.join(work_dir, 'pdf')
    temp_xml_dir = os.path.join(work_dir, 'xml')
    os.makedirs(temp_pdf_dir, exist_ok=True)
    os.makedirs(temp_xml_dir, exist_ok=True)

    # copy pdf to temp dir
    # NOTE(review): the copy keeps the basename of ``pdf_pth`` while all later
    # file names are derived from ``file_name`` - confirm that callers pass
    # matching names.
    shutil.copy(pdf_pth, temp_pdf_dir)

    # process to xml
    client.process(
        'processFulltextDocument',
        temp_pdf_dir,
        tei_coordinates=True,
        force=True,
        verbose=True,
        output=temp_xml_dir,
    )
    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

    # now scan figures
    _extract_figures(temp_pdf_dir, work_dir)
    figure_json_pth = os.path.join(work_dir, 'figure/json', name + '.json')

    # merge to single json
    _, title, abstract, text, headers, figures = single_entry(
        '', xml_pth=xml_pth, fig_json_pth=figure_json_pth)
    json_data = {
        'title': title,
        'abstract': abstract,
        'text': text,
        'headers': headers,
        'figures': figures,
    }
    temp_json_dir = os.path.join(work_dir, 'json')
    os.makedirs(temp_json_dir, exist_ok=True)
    with open(os.path.join(temp_json_dir, name + '.json'), 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)

    # get preprocessed data: one text string per section header
    preprocessed_pth = os.path.join(work_dir, name + '.preprocessed_text.json')
    with open(preprocessed_pth, 'w', encoding='utf-8') as f:
        json.dump(_section_texts(text, headers), f, indent=4)


if __name__ == '__main__':
    process_pdf('/project/example/example.pdf')