"""Build, prettify and search the daily-papers table.

``get_df`` downloads the paper list and enriches every row through the
Hugging Face papers API, ``Prettifier`` turns raw values into HTML links, and
``PaperList`` exposes the display columns plus a date/title search.
"""

import dataclasses
import datetime
import html
import operator
import re

import datasets
import pandas as pd
import requests
import tqdm.auto

# Seconds to wait on a single papers-API request before giving up.
_REQUEST_TIMEOUT = 30


@dataclasses.dataclass(frozen=True)
class PaperInfo:
    """Immutable record for one paper.

    ``published_at`` is reformatted on construction by ``convert_timestamp``.
    """

    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    published_at: str

    def __post_init__(self) -> None:
        # The dataclass is frozen, so the normal attribute assignment is
        # blocked; go through object.__setattr__ to store the converted value.
        object.__setattr__(self, "published_at", PaperInfo.convert_timestamp(self.published_at))

    @staticmethod
    def convert_timestamp(timestamp: str) -> str:
        """Reformat ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` as ``YYYY/MM/DD HH:MM:SS``.

        A string that does not match the expected input format is returned
        unchanged.
        """
        try:
            parsed = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
        except ValueError:
            return timestamp
        return parsed.strftime("%Y/%m/%d %H:%M:%S")


def get_df() -> pd.DataFrame:
    """Load the daily-papers dataset and enrich each row from the papers API.

    The ``train`` split of ``hysts-internal/daily-papers`` is loaded, its
    ``title`` column is dropped, and for every row the title, upvote count and
    publication timestamp are fetched from
    ``https://huggingface.co/api/papers/<arxiv_id>``.

    Returns:
        A DataFrame with one column per ``PaperInfo`` field (also when the
        dataset is empty).

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if a request exceeds ``_REQUEST_TIMEOUT`` seconds.
    """
    df = datasets.load_dataset("hysts-internal/daily-papers")["train"].to_pandas()
    df = df.drop(columns=["title"])

    paper_info = []
    # One session for the whole loop so the connection is reused across rows.
    with requests.Session() as session:
        for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
            response = session.get(
                f"https://huggingface.co/api/papers/{row.arxiv_id}",
                timeout=_REQUEST_TIMEOUT,
            )
            # Fail with a clear HTTP error instead of a KeyError further down.
            response.raise_for_status()
            res = response.json()
            # NOTE(review): ``**row`` requires the remaining dataset columns to
            # be exactly the PaperInfo fields not passed explicitly below.
            info = PaperInfo(
                **row,
                title=res["title"],
                paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
                upvotes=res["upvotes"],
                published_at=res["publishedAt"],
            )
            paper_info.append(info)

    # Passing the columns explicitly keeps the schema stable for an empty list.
    columns = [field.name for field in dataclasses.fields(PaperInfo)]
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info], columns=columns)


class Prettifier:
    """Convert raw paper rows into HTML snippets for display."""

    @staticmethod
    def get_github_link(link: str) -> str:
        """Return a "github" link for ``link``, or ``""`` when there is none.

        Missing values (empty string, ``None``, NaN) all yield ``""``.
        """
        if not isinstance(link, str) or not link:
            return ""
        return Prettifier.create_link("github", link)

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an anchor tag showing ``text`` that opens ``url`` in a new tab.

        Both values are HTML-escaped so that characters such as ``"`` or ``&``
        cannot break out of the attribute or the element body.
        """
        safe_url = html.escape(str(url), quote=True)
        safe_text = html.escape(str(text))
        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

    @staticmethod
    def to_div(text: str | None, category_name: str) -> str:
        """Wrap ``text`` in a ``div`` whose class is ``<category_name>-<text lowercased>``.

        ``None`` is treated as the empty string.
        """
        if text is None:
            text = ""
        class_name = f"{category_name}-{text.lower()}"
        return f'<div class="{html.escape(class_name, quote=True)}">{html.escape(text)}</div>'

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` sorted by ``arxiv_id`` (descending) with link columns.

        The ``date``, ``paper_page`` and ``github`` columns are replaced by
        HTML links; all other columns are passed through unchanged.
        """
        df = df.sort_values("arxiv_id", ascending=False).reset_index(drop=True)
        new_rows = [
            dict(row)
            | {
                "date": Prettifier.create_link(
                    row["date"], f"https://huggingface.co/papers?date={row['date']}"
                ),
                "paper_page": Prettifier.create_link(row["arxiv_id"], row["paper_page"]),
                "github": self.get_github_link(row["github"]),
            }
            for _, row in df.iterrows()
        ]
        return pd.DataFrame(new_rows, columns=df.columns)


class PaperList:
    """Hold the raw paper table, its prettified view, and a search over it."""

    # (column name, datatype label) pairs, in display order.
    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["upvotes", "number"],
    ]

    def __init__(self, df: pd.DataFrame):
        self.df_raw = df
        self._prettifier = Prettifier()
        self.df_prettified = self._prettifier(df).loc[:, self.column_names]

    @property
    def column_names(self) -> list[str]:
        """Names of the displayed columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        """Datatype labels of the displayed columns, in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def search(self, start_date: datetime.datetime, end_date: datetime.datetime, title: str) -> pd.DataFrame:
        """Return prettified rows dated within the range whose title matches.

        Args:
            start_date: Inclusive lower bound on the ``date`` column.
            end_date: Inclusive upper bound on the ``date`` column.
            title: Case-insensitive pattern searched for in ``title``. It is
                used as a regular expression when it compiles as one and as a
                literal substring otherwise.

        Returns:
            The matching rows, prettified and restricted to ``column_names``.
        """
        df = self.df_raw.copy()
        df["date"] = pd.to_datetime(df["date"])

        # Filter by date; copy so the assignment below targets a real frame
        # rather than a slice of the previous one.
        in_range = (df["date"] >= start_date) & (df["date"] <= end_date)
        df = df[in_range].copy()
        df["date"] = df["date"].dt.strftime("%Y-%m-%d")

        # Filter by title. Free-text input such as "C++" is not a valid
        # regular expression, so fall back to a literal match in that case.
        try:
            re.compile(title)
        except re.error:
            use_regex = False
        else:
            use_regex = True
        # na=False: rows without a title never match (and keep the mask boolean).
        matches = df["title"].str.contains(title, case=False, na=False, regex=use_regex)
        df = df[matches]

        return self._prettifier(df).loc[:, self.column_names]