Spaces:

quantaji
/

document2slide-demo

Sleeping

App Files Files Community

document2slide-demo / core /read_pdf.py

quantaji

small fix

2d902f4 over 1 year ago

raw

history blame

3.83 kB

	# Imports; the project root is appended to sys.path below so the `core` package can be imported.
	import os
	import shutil
	import sys
	from subprocess import call

	from grobid_client.grobid_client import GrobidClient

	# Make the project root importable so the `core` package resolves below.
	module_path = os.path.abspath('/project')
	if module_path not in sys.path:
	    # Append only once so repeated imports of this module do not grow sys.path.
	    sys.path.append(module_path)

	from core.tei import single_entry

	# Scratch directory; process_pdf wipes it and then writes all per-paper output beneath it.
	temp_dir = '/project/temp'

	# Tool locations. pdffigures2_home is used as the working directory for the
	# sbt figure-extraction run; grobid_home is not referenced in the visible code.
	grobid_home = '/opt/grobid'
	pdffigures2_home = '/opt/pdffigures2'

	# Configuration file passed to GrobidClient(config_path=...).
	grobid_python_config_pth = '/opt/grobid_client_python/config.json'


	def remove_temp_directory():
	    """Delete the scratch directory ``temp_dir`` together with its contents.

	    A missing directory is not an error; the call is then a no-op.
	    """
	    # Attempt the removal directly instead of checking for existence first:
	    # the directory may disappear between a check and the rmtree call.
	    try:
	        shutil.rmtree(temp_dir)
	    except FileNotFoundError:
	        pass


	def grobid_clident():
	    """Return a new ``GrobidClient`` configured from ``grobid_python_config_pth``.

	    NOTE(review): the name is a misspelling of "grobid_client"; it is kept
	    unchanged because ``process_pdf`` calls it under this name.
	    """
	    return GrobidClient(config_path=grobid_python_config_pth)


	def _extract_figures(pdf_dir: str, paper_dir: str) -> None:
	    """Run the pdffigures2 batch CLI on ``pdf_dir`` and move its ``figure`` output into ``paper_dir``."""
	    fig_dir_prefix = 'figure'
	    img_dir_prefix = 'figure/image'
	    json_dir_prefix = 'figure/json'

	    # sbt is started with cwd=pdffigures2_home, so the output directories are
	    # created there and passed to the CLI as paths relative to that directory.
	    for prefix in (fig_dir_prefix, img_dir_prefix, json_dir_prefix):
	        os.makedirs(os.path.join(pdffigures2_home, prefix), exist_ok=True)

	    # sbt receives the whole runMain invocation as a single argument.
	    run_main = ' '.join([
	        'runMain',
	        'org.allenai.pdffigures2.FigureExtractorBatchCli',
	        '-e',
	        '-q',
	        os.path.abspath(pdf_dir) + '/',
	        '-m',
	        './' + img_dir_prefix + '/',
	        '-d',
	        './' + json_dir_prefix + '/',
	        '-s',
	        './' + fig_dir_prefix + '/stat.json',
	    ])
	    # NOTE(review): the exit status of sbt is not checked; a failed run only
	    # surfaces later when the figure JSON is read.
	    call(['sbt', '-J-Xmx4G', run_main], cwd=pdffigures2_home)

	    # paper_dir already exists, so the directory lands at <paper_dir>/figure.
	    shutil.move(os.path.join(pdffigures2_home, fig_dir_prefix), paper_dir)


	def _section_texts(data: dict) -> list:
	    """Return one string per header: the space-joined ``string`` of its text items."""
	    items = data['text']
	    paper_length = len(items)
	    return [
	        ' '.join(
	            items[idx]['string']
	            # 'end' is inclusive; clamp it so it never runs past the text list.
	            for idx in range(head['start'], min(head['end'] + 1, paper_length))
	        )
	        for head in data['headers']
	    ]


	def process_pdf(pdf_pth: str, file_name: str):
	    """Preprocess one PDF and collect every artifact under ``temp_dir``.

	    Steps, in order:
	      1. wipe ``temp_dir`` and create ``<temp_dir>/<name>/pdf`` and ``.../xml``;
	      2. copy the PDF into the ``pdf`` directory;
	      3. run the GROBID client's ``processFulltextDocument`` service on that
	         directory, writing to the ``xml`` directory;
	      4. run pdffigures2 and move its ``figure`` directory next to them;
	      5. merge XML and figure JSON via ``single_entry`` and write
	         ``<temp_dir>/<name>/json/<name>.json``;
	      6. write ``<temp_dir>/<name>/<name>.preprocessed_text.json``, a list
	         holding one joined text string per header.

	    Args:
	        pdf_pth: Path of the PDF file to process.
	        file_name: File name of the PDF; its stem (``name`` above) labels the
	            per-paper directory and every generated file.

	    Returns:
	        None. All results are written to disk.
	    """
	    # Kept as a local import because json is not among the file-level imports.
	    import json

	    client = grobid_clident()
	    remove_temp_directory()

	    # Strip the extension properly instead of assuming it is four characters.
	    name = os.path.splitext(file_name)[0]
	    paper_dir = os.path.join(temp_dir, name)

	    temp_pdf_dir = os.path.join(paper_dir, 'pdf')
	    temp_xml_dir = os.path.join(paper_dir, 'xml')
	    for directory in (temp_pdf_dir, temp_xml_dir):
	        os.makedirs(directory, exist_ok=True)

	    # Store the copy under the name derived from file_name, so the artifact
	    # names computed below match the file the tools actually see even when
	    # pdf_pth has a different basename.
	    shutil.copy(pdf_pth, os.path.join(temp_pdf_dir, name + '.pdf'))

	    # process to xml
	    client.process(
	        'processFulltextDocument',
	        temp_pdf_dir,
	        tei_coordinates=True,
	        force=True,
	        verbose=True,
	        output=temp_xml_dir,
	    )
	    xml_pth = os.path.join(temp_xml_dir, name + '.tei.xml')

	    # now scan figures
	    _extract_figures(temp_pdf_dir, paper_dir)
	    figure_json_pth = os.path.join(paper_dir, 'figure/json', name + '.json')

	    # merge to single json
	    _, title, abstract, text, headers, figures = single_entry(
	        '', xml_pth=xml_pth, fig_json_pth=figure_json_pth)

	    temp_json_dir = os.path.join(paper_dir, 'json')
	    os.makedirs(temp_json_dir, exist_ok=True)

	    json_data = {
	        'title': title,
	        'abstract': abstract,
	        'text': text,
	        'headers': headers,
	        'figures': figures,
	    }
	    serialized = json.dumps(json_data, indent=4)
	    with open(os.path.join(temp_json_dir, name + '.json'), 'w', encoding='utf-8') as f:
	        f.write(serialized)

	    # Build the section texts from exactly what was persisted (a JSON round
	    # trip), without reading the file back from disk.
	    data = json.loads(serialized)

	    with open(os.path.join(paper_dir, name + '.preprocessed_text.json'), 'w', encoding='utf-8') as f:
	        json.dump(_section_texts(data), f, indent=4)


	if __name__ == '__main__':
	    # process_pdf takes the PDF path and the file name used to label its
	    # outputs; the one-argument call raised TypeError.
	    process_pdf('/project/example/example.pdf', 'example.pdf')