import dataclasses import datetime import operator import pathlib import pandas as pd import requests import tqdm.auto @dataclasses.dataclass(frozen=True) class PaperInfo: arxiv_id: str github: str title: str paper_page: str upvotes: int published_at: str def __post_init__(self): object.__setattr__(self, 'published_at', PaperInfo.convert_timestamp(self.published_at)) @staticmethod def convert_timestamp(timestamp: str) -> str: try: return datetime.datetime.strptime( timestamp, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y/%m/%d %H:%M:%S') except ValueError: return timestamp def get_df(path: pathlib.Path | str) -> pd.DataFrame: df = pd.read_csv(path, dtype=str).fillna('') paper_info = [] for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)): res = requests.get( f'https://huggingface.co/api/papers/{row.arxiv_id}').json() info = PaperInfo( **row, title=res['title'], paper_page=f'https://huggingface.co/papers/{row.arxiv_id}', upvotes=res['upvotes'], published_at=res['publishedAt']) paper_info.append(info) return pd.DataFrame([dataclasses.asdict(info) for info in paper_info]) class Prettifier: @staticmethod def get_paper_page_link(link: str) -> str: return Prettifier.create_link(link.split('/')[-1], link) @staticmethod def get_github_link(link: str) -> str: if not link: return '' return Prettifier.create_link('github', link) @staticmethod def create_link(text: str, url: str) -> str: return f'<a href={url} target="_blank">{text}</a>' @staticmethod def to_div(text: str | None, category_name: str) -> str: if text is None: text = '' class_name = f'{category_name}-{text.lower()}' return f'<div class="{class_name}">{text}</div>' def __call__(self, df: pd.DataFrame) -> pd.DataFrame: df = df.sort_values('arxiv_id', ascending=False).reset_index(drop=True) new_rows = [] for _, row in df.iterrows(): new_row = dict(row) | { 'paper_page': self.get_paper_page_link(row.paper_page), 'github': self.get_github_link(row.github), } new_rows.append(new_row) return pd.DataFrame(new_rows, columns=df.columns) class PaperList: COLUMN_INFO = [ ['paper_page', 'markdown'], ['title', 'str'], ['github', 'markdown'], ['upvotes', 'number'], ] def __init__(self, df: pd.DataFrame): self.df_raw = df self._prettifier = Prettifier() self.df_prettified = self._prettifier(df).loc[:, self.column_names] @property def column_names(self): return list(map(operator.itemgetter(0), self.COLUMN_INFO)) @property def column_datatype(self): return list(map(operator.itemgetter(1), self.COLUMN_INFO))