import re
import json
import xml.etree.ElementTree as ET
from datetime import date, datetime

import requests
from requests.exceptions import HTTPError

def _get_today():
    """Return today's date as an ISO-formatted string (YYYY-MM-DD)."""
    return str(date.today())

def _download_pdf_from_arxiv(filename):
    """Download raw PDF bytes from arxiv.org for a filename such as '2106.09685.pdf'."""
    url = f"https://arxiv.org/pdf/{filename}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    raise HTTPError(f"Failed to download PDF for arXiv id {filename} (status code: {response.status_code})")

def download_pdf_from_arxiv(arxiv_id):
    """Download the PDF for an arXiv ID, save it as '<arxiv_id>.pdf', and return the filename."""
    filename = f"{arxiv_id}.pdf"
    pdf_content = _download_pdf_from_arxiv(filename)
    # Save the PDF content to a file in the current working directory
    with open(filename, "wb") as f:
        f.write(pdf_content)
    return filename

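# Example usage (hypothetical ID; performs a live HTTP request when run):
#   pdf_path = download_pdf_from_arxiv("2106.09685")
#   print(pdf_path)  # -> "2106.09685.pdf", saved in the current working directory
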
def _get_papers_from_hf_daily_papers(target_date):
    """Fetch the raw JSON payload of Hugging Face daily papers for a date (YYYY-MM-DD)."""
    if target_date is None:
        target_date = _get_today()
        print(f"target_date is not set => scraping today's papers [{target_date}]")
    url = f"https://huggingface.co/api/daily_papers?date={target_date}"
    response = requests.get(url)
    if response.status_code == 200:
        return target_date, response.text
    raise HTTPError(f"Error fetching data. Status code: {response.status_code}")

def get_papers_from_hf_daily_papers(target_date):
    """Return (target_date, papers), where papers is the parsed list of daily-paper
    entries, each annotated with the date it was fetched for."""
    target_date, results = _get_papers_from_hf_daily_papers(target_date)
    results = json.loads(results)
    for result in results:
        result["target_date"] = target_date
    return target_date, results

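# Example usage (any past date in YYYY-MM-DD form; None falls back to today):
#   target_date, papers = get_papers_from_hf_daily_papers("2024-05-01")
#   print(f"{len(papers)} papers fetched for {target_date}")
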
def _get_paper_xml_by_arxiv_id(arxiv_id):
    """Query the arXiv Atom API for a single paper and return the raw HTTP response."""
    url = f"http://export.arxiv.org/api/query?search_query=id:{arxiv_id}&start=0&max_results=1"
    return requests.get(url)

def _is_arxiv_id_valid(arxiv_id):
    """Check that the ID matches the post-2007 arXiv format YYMM.NNNN or YYMM.NNNNN
    (IDs issued from 2015 onward have five digits after the dot, earlier ones four)."""
    pattern = r"^\d{4}\.\d{4,5}$"
    return bool(re.match(pattern, arxiv_id))

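# Examples:
#   _is_arxiv_id_valid("2106.09685")      # True  (post-2015, five-digit sequence number)
#   _is_arxiv_id_valid("0704.0001")       # True  (2007-2014, four-digit sequence number)
#   _is_arxiv_id_valid("hep-th/9901001")  # False (old-style IDs are not supported here)
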
def _get_paper_metadata_by_arxiv_id(response):
    """Parse title, authors, abstract, and publication date out of an arXiv Atom response."""
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    root = ET.fromstring(response.content)
    entry = root.find("atom:entry", ns)
    title = entry.find("atom:title", ns).text
    authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)]
    abstract = entry.find("atom:summary", ns).text
    target_date = entry.find("atom:published", ns).text
    return title, authors, abstract, target_date

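# The Atom feed returned by the arXiv API looks roughly like this (abridged):
#   <feed xmlns="http://www.w3.org/2005/Atom">
#     <entry>
#       <title>...</title>
#       <author><name>...</name></author>
#       <summary>...</summary>
#       <published>2024-05-01T17:59:59Z</published>
#     </entry>
#   </feed>
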
def get_papers_from_arxiv_ids(arxiv_ids):
    """Look up metadata for each arXiv ID and return a list of paper records."""
    results = []
    for arxiv_id in arxiv_ids:
        print(arxiv_id)
        if not _is_arxiv_id_valid(arxiv_id):
            print(f"......not a valid arXiv ID [{arxiv_id}]")
            continue
        try:
            response = _get_paper_xml_by_arxiv_id(arxiv_id)
            title, authors, abstract, target_date = _get_paper_metadata_by_arxiv_id(response)
            # arXiv reports timestamps like "2024-05-01T17:59:59Z"; keep only the date part
            datetime_obj = datetime.strptime(target_date, "%Y-%m-%dT%H:%M:%SZ")
            formatted_date = datetime_obj.strftime("%Y-%m-%d")
            results.append(
                {
                    "title": title,
                    "target_date": formatted_date,
                    "paper": {
                        "summary": abstract,
                        "id": arxiv_id,
                        "authors": authors,
                    },
                }
            )
        except Exception:
            print("......something went wrong while downloading metadata")
            print("......this usually happens with papers published today")
            continue
    return results
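
if __name__ == "__main__":
    # Minimal smoke test (hypothetical ID; requires network access).
    papers = get_papers_from_arxiv_ids(["2106.09685"])
    print(json.dumps(papers, indent=2))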