import datetime
import operator

import datasets
import pandas as pd
import tqdm.auto
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
from ragatouille import RAGPretrainedModel
api = HfApi()
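
# Download the prebuilt ColBERT index of paper abstracts from the Hub
# and load it with RAGatouille.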
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
api.snapshot_download(
repo_id=INDEX_REPO_ID,
repo_type="dataset",
local_dir=INDEX_DIR_PATH,
)
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
# Run once to initialize the retriever
abstract_retriever.search("LLM")


def update_abstract_index() -> None:
    """Re-download the latest index snapshot from the Hub and reload the retriever."""
    global abstract_retriever
api.snapshot_download(
repo_id=INDEX_REPO_ID,
repo_type="dataset",
local_dir=INDEX_DIR_PATH,
)
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
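    # Warm up the reloaded retriever with a dummy query, as at startup.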
abstract_retriever.search("LLM")
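
# Refresh the abstract index from the Hub every hour
# (missed runs get a 3-minute misfire grace period).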
scheduler = BackgroundScheduler()
scheduler.add_job(func=update_abstract_index, trigger="cron", hour="*", timezone="UTC", misfire_grace_time=3 * 60)
scheduler.start()


def get_df() -> pd.DataFrame:
    """Load the daily-papers metadata and stats datasets and merge them on arxiv_id."""
    df = pd.merge(
left=datasets.load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
right=datasets.load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
on="arxiv_id",
)
    df = df[::-1].reset_index(drop=True)  # newest papers first
    df["date"] = df["date"].dt.strftime("%Y-%m-%d")
    # Authors and abstracts are not shown in the table, so drop them.
    df = df.drop(columns=["authors", "abstract"])
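    # Attach a link to each paper's Hugging Face paper page.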
paper_info = []
for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
info = row.copy()
info["paper_page"] = f"https://huggingface.co/papers/{row.arxiv_id}"
paper_info.append(info)
return pd.DataFrame(paper_info)


class Prettifier:
    """Turn raw paper rows into HTML links and cells for display."""
@staticmethod
def get_github_link(link: str) -> str:
if not link:
return ""
return Prettifier.create_link("github", link)
@staticmethod
def create_link(text: str, url: str) -> str:
return f'<a href="{url}" target="_blank">{text}</a>'
@staticmethod
def to_div(text: str | None, category_name: str) -> str:
if text is None:
text = ""
class_name = f"{category_name}-{text.lower()}"
return f'<div class="{class_name}">{text}</div>'
def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
new_rows = []
for _, row in df.iterrows():
            new_row = {
                "date": Prettifier.create_link(row.date, f"https://huggingface.co/papers?date={row.date}"),
                "paper_page": Prettifier.create_link(row.arxiv_id, row.paper_page),
                "title": row["title"],
                "github": self.get_github_link(row.github),
                "👍": row["upvotes"],
                "💬": row["num_comments"],
            }
new_rows.append(new_row)
return pd.DataFrame(new_rows)


class PaperList:
    """Wrap the paper table and expose prettified columns plus search."""

    COLUMN_INFO = [
        ["date", "markdown"],
        ["paper_page", "markdown"],
        ["title", "str"],
        ["github", "markdown"],
        ["👍", "number"],
        ["💬", "number"],
    ]
def __init__(self, df: pd.DataFrame):
self.df_raw = df
self._prettifier = Prettifier()
self.df_prettified = self._prettifier(df).loc[:, self.column_names]
    @property
    def column_names(self) -> list[str]:
        return list(map(operator.itemgetter(0), self.COLUMN_INFO))

    @property
    def column_datatype(self) -> list[str]:
        return list(map(operator.itemgetter(1), self.COLUMN_INFO))

    def search(
        self,
        start_date: datetime.datetime,
        end_date: datetime.datetime,
        title_search_query: str,
        abstract_search_query: str,
        max_num_to_retrieve: int,
    ) -> pd.DataFrame:
        """Filter papers by date and title, optionally reranking by abstract relevance."""
        df = self.df_raw.copy()
df["date"] = pd.to_datetime(df["date"])
# Filter by date
df = df[(df["date"] >= start_date) & (df["date"] <= end_date)]
df["date"] = df["date"].dt.strftime("%Y-%m-%d")
# Filter by title
        # Treat the query as a literal substring, not a regex, so special characters are safe.
        df = df[df["title"].str.contains(title_search_query, case=False, regex=False)]
# Filter by abstract
if abstract_search_query:
results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
            remaining_ids = set(df["arxiv_id"])
            # Deduplicate retrieved ids while preserving the retriever's ranking order.
            found_id_set = set()
            found_ids = []
for x in results:
arxiv_id = x["document_id"]
if arxiv_id not in remaining_ids:
continue
if arxiv_id in found_id_set:
continue
found_id_set.add(arxiv_id)
found_ids.append(arxiv_id)
            # Keep only the retrieved papers, ordered by retrieval rank.
            df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
df_prettified = self._prettifier(df).loc[:, self.column_names]
return df_prettified
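

# A minimal usage sketch (hypothetical query values, not part of the app wiring):
#
#   paper_list = PaperList(get_df())
#   results = paper_list.search(
#       start_date=datetime.datetime(2024, 1, 1),
#       end_date=datetime.datetime(2024, 12, 31),
#       title_search_query="diffusion",
#       abstract_search_query="image generation",
#       max_num_to_retrieve=50,
#   )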