Spaces:
Runtime error
Runtime error
# encoding: utf-8 | |
import os | |
import tqdm | |
from bs4 import BeautifulSoup as bs | |
import urllib.request | |
import json | |
import datetime | |
import pytz | |
def _cache_path(field_abbr):
    """Return the path of today's JSONL cache file for ``field_abbr``.

    "Today" is the current calendar date in the America/New_York timezone.
    """
    # Take the date straight from the timezone-aware datetime. Going through
    # ``date.fromtimestamp(aware_dt.timestamp())`` converts the POSIX
    # timestamp back using the *host's* local timezone, which silently drops
    # the New York timezone requested here.
    today = datetime.datetime.now(tz=pytz.timezone("America/New_York")).date()
    stamp = today.strftime("%a, %d %b %y")
    return f"./data/{field_abbr}_{stamp}.jsonl"


def _download_new_papers(field_abbr):
    """Scrape the arXiv "new" listing for ``field_abbr`` and cache it on disk.

    The listing page is fetched, the first ``<dl>`` inside the ``content``
    div is parsed into one dictionary per paper (keys: ``main_page``,
    ``pdf``, ``title``, ``authors``, ``subjects``, ``abstract``) and the
    dictionaries are written to today's cache file as JSON lines.

    Args:
        field_abbr: arXiv listing abbreviation, e.g. ``"cs"``.

    Returns:
        The path of the JSONL file that was written.

    Raises:
        ValueError: if the page does not contain the expected listing, or the
            numbers of ``<dt>`` and ``<dd>`` entries differ.
    """
    new_sub_url = f'https://arxiv.org/list/{field_abbr}/new'  # https://arxiv.org/list/cs/new
    # Close the HTTP response deterministically; BeautifulSoup reads the
    # whole stream inside its constructor, so parsing inside the ``with``
    # block is sufficient.
    with urllib.request.urlopen(new_sub_url) as page:
        # Name the parser explicitly so results do not depend on which
        # optional parser happens to be installed.
        soup = bs(page, "html.parser")

    content = soup.body.find("div", {'id': 'content'})
    if content is None or content.dl is None:
        raise ValueError(f"no paper listing found at {new_sub_url}")

    dt_list = content.dl.find_all("dt")
    dd_list = content.dl.find_all("dd")
    # Explicit check instead of ``assert``: asserts are stripped under -O.
    if len(dt_list) != len(dd_list):
        raise ValueError(
            f"mismatched listing at {new_sub_url}: "
            f"{len(dt_list)} <dt> entries vs {len(dd_list)} <dd> entries"
        )

    arxiv_base = "https://arxiv.org/abs/"
    pdf_base = arxiv_base.replace('abs', 'pdf')
    new_paper_list = []
    for dt, dd in tqdm.tqdm(zip(dt_list, dd_list), total=len(dt_list)):
        # Third space-separated token of the <dt> text, with everything up to
        # the last ":" removed (presumably "arXiv:<id>" -> "<id>"; verify
        # against the current page markup).
        paper_number = dt.text.strip().split(" ")[2].split(":")[-1]
        paper = {
            'main_page': arxiv_base + paper_number,
            'pdf': pdf_base + paper_number,
            'title': dd.find("div", {"class": "list-title mathjax"}).text
            .replace("Title: ", "").strip(),
            'authors': dd.find("div", {"class": "list-authors"}).text
            .replace("Authors:\n", "").replace("\n", "").strip(),
            'subjects': dd.find("div", {"class": "list-subjects"}).text
            .replace("Subjects: ", "").strip(),
            'abstract': dd.find("p", {"class": "mathjax"}).text
            .replace("\n", " ").strip(),
        }
        new_paper_list.append(paper)

    # Make sure ./data exists (no error if it already does).
    os.makedirs("./data", exist_ok=True)

    # Save one JSON object per line.
    path = _cache_path(field_abbr)
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(paper) + "\n" for paper in new_paper_list)
    return path


def get_papers(field_abbr, limit=None):
    """Return today's new arXiv papers for ``field_abbr``.

    The papers are read from today's JSONL cache file; if that file does not
    exist yet it is created first by downloading the current listing.

    Args:
        field_abbr: arXiv listing abbreviation, e.g. ``"cs"``.
        limit: maximum number of papers to return. ``None`` or ``0`` (any
            falsy value) means no limit.

    Returns:
        A list of paper dictionaries, in file order.
    """
    path = _cache_path(field_abbr)
    if not os.path.exists(path):
        # Read back exactly the file that was written, even if the calendar
        # date rolls over between the check above and the download.
        path = _download_new_papers(field_abbr)

    results = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate lazily instead of materialising every line first.
        for i, line in enumerate(f):
            if limit and i == limit:
                break
            results.append(json.loads(line))
    return results