import numpy as np
from pypdf import PdfReader
from urllib.parse import urlparse
import requests
from semanticscholar import SemanticScholar
### Input Formatting Module
## Input formatting for the given paper
# Extracting text from a pdf or a link
def get_text_from_pdf(file_path):
    """
    Convert a PDF into a list of per-page text strings.
    """
    reader = PdfReader(file_path)
    text = []
    for p in reader.pages:
        t = p.extract_text()
        text.append(t)
    return text
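
# Example usage (illustrative sketch; 'paper.pdf' is a placeholder path, not a
# file shipped with this module):
#   pages = get_text_from_pdf('paper.pdf')
#   print(len(pages), 'pages extracted')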

def get_text_from_url(url, file_path='paper.pdf'):
    """
    Download the paper behind a URL and return its text as a list of pages.
    """
    ## Check for different URL cases
    url_parts = urlparse(url)
    # arxiv
    if 'arxiv' in url_parts.netloc:
        if 'abs' in url_parts.path:
            # abstract page, change the url to the pdf link
            paper_id = url_parts.path.split('/')[-1]
            url = 'https://www.arxiv.org/pdf/%s.pdf' % (paper_id)
        elif 'pdf' in url_parts.path:
            # already a pdf link, keep it as is
            pass
        else:
            raise ValueError('invalid url: expected an arXiv abs or pdf link')
    else:
        raise ValueError('invalid url: only arXiv links are supported')
    # download the file
    download_pdf(url, file_path)
    # get the text from the pdf file
    text = get_text_from_pdf(file_path)
    return text
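
# Example usage (illustrative sketch; the arXiv link below is a placeholder,
# any valid abs/pdf link should work):
#   pages = get_text_from_url('https://arxiv.org/abs/1706.03762')
#   print(pages[0][:200])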

def download_pdf(url, file_name):
    """
    Download the pdf file from the given url and save it as file_name.
    """
    # Send GET request
    response = requests.get(url)
    # Save the PDF
    if response.status_code == 200:
        with open(file_name, "wb") as f:
            f.write(response.content)
    elif response.status_code == 404:
        raise ValueError('cannot download the file')
    else:
        # any other failure also leaves no usable PDF, so raise instead of
        # silently continuing
        raise ValueError('cannot download the file (HTTP %d)' % response.status_code)

## Input formatting for the given author (reviewer)
# Extracting text from a link
def get_text_from_author_id(author_id, max_count=100):
    """
    Fetch an author's name and up to max_count papers (title and abstract)
    from the Semantic Scholar Graph API.
    """
    if author_id is None:
        raise ValueError('Input valid author ID')
    aid = str(author_id)
    if 'http' in aid:  # handle semantic scholar url input
        # e.g. https://www.semanticscholar.org/author/<name>/<id> -> <id>
        aid = aid.split('/')
        aid = aid[aid.index('author') + 2]
    url = "https://api.semanticscholar.org/graph/v1/author/%s?fields=url,name,paperCount,papers,papers.title,papers.abstract" % aid
    r = requests.get(url)
    if r.status_code == 404:
        raise ValueError('Author link not found.')
    data = r.json()
    papers = data['papers'][:max_count]
    name = data['name']
    return name, papers
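
# Example usage (illustrative sketch; the author ID below is a placeholder):
#   name, papers = get_text_from_author_id('1741101')
#   print(name, len(papers))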

## TODO Preprocess Extracted Texts from PDFs
# Get a portion of the text for the actual task
def get_title(text):
    pass

def get_abstract(text):
    pass

def get_introduction(text):
    pass

def get_conclusion(text):
    pass
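
# A minimal sketch of how one of the TODO helpers above might work. It is not
# part of the module's API; it assumes the common "Abstract ... Introduction"
# layout on the first page, which will not hold for every paper.
def _example_get_abstract(text):
    """Return the text between 'Abstract' and 'Introduction' on the first page."""
    first_page = text[0]
    start = first_page.find('Abstract')
    end = first_page.find('Introduction')
    if start == -1 or end == -1 or end <= start:
        # fall back to the whole first page if the headings cannot be located
        return first_page
    return first_page[start + len('Abstract'):end].strip()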