Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import dataclasses | |
import datetime | |
import operator | |
import pathlib | |
import pandas as pd | |
import requests | |
import tqdm.auto | |
@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record holding the metadata of a single paper.

    The ``published_at`` value passed to the constructor is normalized by
    :meth:`convert_timestamp` right after initialization.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self) -> None:
        """Replace ``published_at`` with its normalized representation."""
        # The dataclass is frozen, so regular attribute assignment would raise;
        # object.__setattr__ is the sanctioned way to set a field here.
        object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Convert ``%Y-%m-%dT%H:%M:%S.%fZ`` text to ``%Y/%m/%d %H:%M:%S``.

        Args:
            timestamp: Timestamp text to convert.

        Returns:
            The reformatted timestamp, or ``timestamp`` unchanged when it does
            not match the expected input format.
        """
        try:
            parsed = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            # Best effort: keep whatever text was supplied.
            return timestamp
        return parsed.strftime("%Y/%m/%d %H:%M:%S")
def get_df(path: pathlib.Path | str) -> pd.DataFrame:
    """Load a CSV of papers and enrich every row via the Hugging Face papers API.

    Every CSV column is read as text (missing cells become ``""``) and is
    forwarded as a keyword argument to ``PaperInfo``, so the CSV columns must
    match the ``PaperInfo`` fields that are not filled in from the API
    response (``title``, ``paper_page``, ``upvotes``, ``published_at``).

    Args:
        path: Location of the CSV file; it must contain an ``arxiv_id`` column.

    Returns:
        A DataFrame with one row per paper, built from the ``PaperInfo`` fields.

    Raises:
        requests.RequestException: If a request times out, fails, or the API
            answers with an HTTP error status.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    paper_info = []
    for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
        # Without a timeout a stalled connection would block this loop forever.
        response = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}", timeout=30)
        # Fail loudly on HTTP errors instead of tripping over a missing key below.
        response.raise_for_status()
        res = response.json()
        info = PaperInfo(
            **row,
            title=res["title"],
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
            upvotes=res["upvotes"],
            published_at=res["publishedAt"],
        )
        paper_info.append(info)
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])
class Prettifier: | |
def get_github_link(link: str) -> str: | |
if not link: | |
return "" | |
return Prettifier.create_link("github", link) | |
def create_link(text: str, url: str) -> str: | |
return f'<a href="{url}" target="_blank">{text}</a>' | |
def to_div(text: str | None, category_name: str) -> str: | |
if text is None: | |
text = "" | |
class_name = f"{category_name}-{text.lower()}" | |
return f'<div class="{class_name}">{text}</div>' | |
def __call__(self, df: pd.DataFrame) -> pd.DataFrame: | |
df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True) | |
new_rows = [] | |
for _, row in df.iterrows(): | |
new_row = dict(row) | { | |
"date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"), | |
"paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page), | |
"github": self.get_github_link(row.github), | |
} | |
new_rows.append(new_row) | |
return pd.DataFrame(new_rows, columns=df.columns) | |
class PaperList:
    """Hold the raw paper table together with its prettified display version."""

    # Each entry is [column name, display datatype]; the order defines the
    # column order of ``df_prettified``.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        """Store ``df`` and build the prettified frame limited to the display columns.

        Args:
            df: Raw paper table; it must contain every column named in
                ``COLUMN_INFO`` plus whatever ``Prettifier`` reads.
        """
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        """Display datatypes of the columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))