Spaces:

hysts
/

daily-papers

Running on CPU Upgrade

App Files Files Community

hysts HF staff committed on Mar 11

Commit

5884212

•

1 Parent(s): d2cd00e

Update

Browse files

Files changed (6) hide show

app.py +4 -9
last_updated.txt +0 -0
papers.py +4 -3
requirements.txt +0 -1
style.css +9 -1
update_scheduler.py +0 -130

app.py CHANGED Viewed

@@ -1,23 +1,17 @@
 #!/usr/bin/env python
-import os
 import gradio as gr
 import pandas as pd
 from gradio_calendar import Calendar
 from papers import PaperList, get_df
-from update_scheduler import UpdateScheduler
 DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
-paper_list = PaperList(get_df("papers.csv"))
-if (SPACE_ID := os.getenv("SPACE_ID")) is not None:
-    CRON_HOUR = os.getenv("CRON_HOUR", "*/4")
-    CRON_MINUTE = os.getenv("CRON_MINUTE", "0")
-    scheduler = UpdateScheduler(space_id=SPACE_ID, cron_hour=CRON_HOUR, cron_minute=CRON_MINUTE)
-    scheduler.start()
 def update_num_papers(df: pd.DataFrame) -> str:
@@ -43,6 +37,7 @@ with gr.Blocks(css="style.css") as demo:
         column_widths=["10%", "10%", "60%", "10%", "10%"],
         wrap=True,
     )
     gr.on(
         triggers=[start_date.change, end_date.change, search_title.submit],

 #!/usr/bin/env python
 import gradio as gr
 import pandas as pd
 from gradio_calendar import Calendar
 from papers import PaperList, get_df
 DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
+with open("last_updated.txt") as f:
+    last_updated = f.read()
+paper_list = PaperList(get_df())
 def update_num_papers(df: pd.DataFrame) -> str:
         column_widths=["10%", "10%", "60%", "10%", "10%"],
         wrap=True,
     )
+    gr.Markdown(value=f"Last updated: {last_updated}", elem_id="last-updated")
     gr.on(
         triggers=[start_date.change, end_date.change, search_title.submit],

last_updated.txt ADDED Viewed

File without changes

papers.py CHANGED Viewed

@@ -1,8 +1,8 @@
 import dataclasses
 import datetime
 import operator
-import pathlib
 import pandas as pd
 import requests
 import tqdm.auto
@@ -29,8 +29,9 @@ class PaperInfo:
             return timestamp
-def get_df(path: pathlib.Path | str) -> pd.DataFrame:
-    df = pd.read_csv(path, dtype=str).fillna("")
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()

 import dataclasses
 import datetime
 import operator
+import datasets
 import pandas as pd
 import requests
 import tqdm.auto
             return timestamp
+def get_df() -> pd.DataFrame:
+    df = datasets.load_dataset("hysts-internal/daily-papers")["train"].to_pandas()
+    df = df.drop(columns=["title"])
     paper_info = []
     for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
         res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
-apscheduler==3.10.4
 gradio==4.21.0
 gradio_calendar==0.0.4
 huggingface_hub==0.21.4

 gradio==4.21.0
 gradio_calendar==0.0.4
 huggingface_hub==0.21.4

style.css CHANGED Viewed

@@ -4,7 +4,7 @@ h1 {
 }
 body a,
-#component-0 a,
 #table a {
   background-color: transparent;
   color: #58a6ff;
@@ -19,3 +19,11 @@ body a:hover {
 body a:hover {
   text-decoration: underline;
 }

 }
 body a,
+.contain a,
 #table a {
   background-color: transparent;
   color: #58a6ff;
 body a:hover {
   text-decoration: underline;
 }
+#last-updated {
+  display: block;
+  text-align: center;
+  margin-top: 1em;
+  font-size: 0.8em;
+  color: #666;
+}

update_scheduler.py DELETED Viewed

@@ -1,130 +0,0 @@
-import datetime
-import pathlib
-import re
-import tempfile
-import pandas as pd
-import requests
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi, Repository
-from huggingface_hub.utils import RepositoryNotFoundError
-class SpaceRestarter:
-    def __init__(self, space_id: str):
-        self.api = HfApi()
-        if self.api.get_token_permission() != "write":
-            raise ValueError("The HF token must have write permission.")
-        try:
-            self.api.space_info(repo_id=space_id)
-        except RepositoryNotFoundError:
-            raise ValueError("The Space ID does not exist.")
-        self.space_id = space_id
-    def restart(self) -> None:
-        self.api.restart_space(self.space_id)
-def find_github_links(summary: str) -> str:
-    links = re.findall(r"https://github.com/[^/]+/[^/)}, ]+(?:/(?:tree|blob)/[^/]+/[^/)}, ]+)?", summary)
-    if len(links) == 0:
-        return ""
-    if len(links) != 1:
-        raise RuntimeError(f"Found multiple GitHub links: {links}")
-    link = links[0]
-    if link.endswith("."):
-        link = link[:-1]
-    link = link.strip()
-    return link
-class RepoUpdater:
-    def __init__(self, repo_id: str, repo_type: str):
-        api = HfApi()
-        if api.get_token_permission() != "write":
-            raise ValueError("The HF token must have write permission.")
-        name = api.whoami()["name"]
-        repo_dir = pathlib.Path(tempfile.tempdir) / repo_id.split("/")[-1]  # type: ignore
-        self.csv_path = repo_dir / "papers.csv"
-        self.repo = Repository(
-            local_dir=repo_dir,
-            clone_from=repo_id,
-            repo_type=repo_type,
-            git_user=name,
-            git_email=f"{name}@users.noreply.huggingface.co",
-        )
-        self.repo.git_pull()
-    def update(self) -> None:
-        yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
-        today = datetime.datetime.now().strftime("%Y-%m-%d")
-        daily_papers = [
-            {
-                "date": yesterday,
-                "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={yesterday}").json(),
-            },
-            {
-                "date": today,
-                "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={today}").json(),
-            },
-        ]
-        self.repo.git_pull()
-        df = pd.read_csv(self.csv_path, dtype=str).fillna("")
-        rows = [row for _, row in df.iterrows()]
-        arxiv_ids = {row.arxiv_id for row in rows}
-        for d in daily_papers:
-            date = d["date"]
-            papers = d["papers"]
-            for paper in papers:
-                arxiv_id = paper["paper"]["id"]
-                if arxiv_id in arxiv_ids:
-                    continue
-                try:
-                    github = find_github_links(paper["paper"]["summary"])
-                except RuntimeError as e:
-                    print(e)
-                    continue
-                rows.append(
-                    pd.Series(
-                        {
-                            "date": date,
-                            "arxiv_id": arxiv_id,
-                            "github": github,
-                        }
-                    )
-                )
-        df = pd.DataFrame(rows).reset_index(drop=True)
-        df.to_csv(self.csv_path, index=False)
-    def push(self) -> None:
-        self.repo.push_to_hub()
-class UpdateScheduler:
-    def __init__(self, space_id: str, cron_hour: str, cron_minute: str, cron_second: str = "0"):
-        self.space_restarter = SpaceRestarter(space_id=space_id)
-        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type="space")
-        self.scheduler = BackgroundScheduler()
-        self.scheduler.add_job(
-            func=self._update,
-            trigger="cron",
-            hour=cron_hour,
-            minute=cron_minute,
-            second=cron_second,
-            timezone="UTC",
-        )
-    def _update(self) -> None:
-        self.repo_updater.update()
-        if self.repo_updater.repo.is_repo_clean():
-            self.space_restarter.restart()
-        else:
-            self.repo_updater.push()
-    def start(self) -> None:
-        self.scheduler.start()