hysts HF staff committed on
Commit
5884212
1 Parent(s): d2cd00e
Files changed (6) hide show
  1. app.py +4 -9
  2. last_updated.txt +0 -0
  3. papers.py +4 -3
  4. requirements.txt +0 -1
  5. style.css +9 -1
  6. update_scheduler.py +0 -130
app.py CHANGED
@@ -1,23 +1,17 @@
1
  #!/usr/bin/env python
2
 
3
- import os
4
-
5
  import gradio as gr
6
  import pandas as pd
7
  from gradio_calendar import Calendar
8
 
9
  from papers import PaperList, get_df
10
- from update_scheduler import UpdateScheduler
11
 
12
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
13
 
14
- paper_list = PaperList(get_df("papers.csv"))
 
15
 
16
- if (SPACE_ID := os.getenv("SPACE_ID")) is not None:
17
- CRON_HOUR = os.getenv("CRON_HOUR", "*/4")
18
- CRON_MINUTE = os.getenv("CRON_MINUTE", "0")
19
- scheduler = UpdateScheduler(space_id=SPACE_ID, cron_hour=CRON_HOUR, cron_minute=CRON_MINUTE)
20
- scheduler.start()
21
 
22
 
23
  def update_num_papers(df: pd.DataFrame) -> str:
@@ -43,6 +37,7 @@ with gr.Blocks(css="style.css") as demo:
43
  column_widths=["10%", "10%", "60%", "10%", "10%"],
44
  wrap=True,
45
  )
 
46
 
47
  gr.on(
48
  triggers=[start_date.change, end_date.change, search_title.submit],
 
1
  #!/usr/bin/env python
2
 
 
 
3
  import gradio as gr
4
  import pandas as pd
5
  from gradio_calendar import Calendar
6
 
7
  from papers import PaperList, get_df
 
8
 
9
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
10
 
11
+ with open("last_updated.txt") as f:
12
+ last_updated = f.read()
13
 
14
+ paper_list = PaperList(get_df())
 
 
 
 
15
 
16
 
17
  def update_num_papers(df: pd.DataFrame) -> str:
 
37
  column_widths=["10%", "10%", "60%", "10%", "10%"],
38
  wrap=True,
39
  )
40
+ gr.Markdown(value=f"Last updated: {last_updated}", elem_id="last-updated")
41
 
42
  gr.on(
43
  triggers=[start_date.change, end_date.change, search_title.submit],
last_updated.txt ADDED
File without changes
papers.py CHANGED
@@ -1,8 +1,8 @@
1
  import dataclasses
2
  import datetime
3
  import operator
4
- import pathlib
5
 
 
6
  import pandas as pd
7
  import requests
8
  import tqdm.auto
@@ -29,8 +29,9 @@ class PaperInfo:
29
  return timestamp
30
 
31
 
32
- def get_df(path: pathlib.Path | str) -> pd.DataFrame:
33
- df = pd.read_csv(path, dtype=str).fillna("")
 
34
  paper_info = []
35
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
36
  res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()
 
1
  import dataclasses
2
  import datetime
3
  import operator
 
4
 
5
+ import datasets
6
  import pandas as pd
7
  import requests
8
  import tqdm.auto
 
29
  return timestamp
30
 
31
 
32
+ def get_df() -> pd.DataFrame:
33
+ df = datasets.load_dataset("hysts-internal/daily-papers")["train"].to_pandas()
34
+ df = df.drop(columns=["title"])
35
  paper_info = []
36
  for _, row in tqdm.auto.tqdm(df.iterrows(), total=len(df)):
37
  res = requests.get(f"https://huggingface.co/api/papers/{row.arxiv_id}").json()
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
- apscheduler==3.10.4
2
  gradio==4.21.0
3
  gradio_calendar==0.0.4
4
  huggingface_hub==0.21.4
 
 
1
  gradio==4.21.0
2
  gradio_calendar==0.0.4
3
  huggingface_hub==0.21.4
style.css CHANGED
@@ -4,7 +4,7 @@ h1 {
4
  }
5
 
6
  body a,
7
- #component-0 a,
8
  #table a {
9
  background-color: transparent;
10
  color: #58a6ff;
@@ -19,3 +19,11 @@ body a:hover {
19
  body a:hover {
20
  text-decoration: underline;
21
  }
 
 
 
 
 
 
 
 
 
4
  }
5
 
6
  body a,
7
+ .contain a,
8
  #table a {
9
  background-color: transparent;
10
  color: #58a6ff;
 
19
  body a:hover {
20
  text-decoration: underline;
21
  }
22
+
23
+ #last-updated {
24
+ display: block;
25
+ text-align: center;
26
+ margin-top: 1em;
27
+ font-size: 0.8em;
28
+ color: #666;
29
+ }
update_scheduler.py DELETED
@@ -1,130 +0,0 @@
1
- import datetime
2
- import pathlib
3
- import re
4
- import tempfile
5
-
6
- import pandas as pd
7
- import requests
8
- from apscheduler.schedulers.background import BackgroundScheduler
9
- from huggingface_hub import HfApi, Repository
10
- from huggingface_hub.utils import RepositoryNotFoundError
11
-
12
-
13
- class SpaceRestarter:
14
- def __init__(self, space_id: str):
15
- self.api = HfApi()
16
- if self.api.get_token_permission() != "write":
17
- raise ValueError("The HF token must have write permission.")
18
- try:
19
- self.api.space_info(repo_id=space_id)
20
- except RepositoryNotFoundError:
21
- raise ValueError("The Space ID does not exist.")
22
- self.space_id = space_id
23
-
24
- def restart(self) -> None:
25
- self.api.restart_space(self.space_id)
26
-
27
-
28
- def find_github_links(summary: str) -> str:
29
- links = re.findall(r"https://github.com/[^/]+/[^/)}, ]+(?:/(?:tree|blob)/[^/]+/[^/)}, ]+)?", summary)
30
- if len(links) == 0:
31
- return ""
32
- if len(links) != 1:
33
- raise RuntimeError(f"Found multiple GitHub links: {links}")
34
- link = links[0]
35
- if link.endswith("."):
36
- link = link[:-1]
37
- link = link.strip()
38
- return link
39
-
40
-
41
- class RepoUpdater:
42
- def __init__(self, repo_id: str, repo_type: str):
43
- api = HfApi()
44
- if api.get_token_permission() != "write":
45
- raise ValueError("The HF token must have write permission.")
46
-
47
- name = api.whoami()["name"]
48
-
49
- repo_dir = pathlib.Path(tempfile.tempdir) / repo_id.split("/")[-1] # type: ignore
50
- self.csv_path = repo_dir / "papers.csv"
51
- self.repo = Repository(
52
- local_dir=repo_dir,
53
- clone_from=repo_id,
54
- repo_type=repo_type,
55
- git_user=name,
56
- git_email=f"{name}@users.noreply.huggingface.co",
57
- )
58
- self.repo.git_pull()
59
-
60
- def update(self) -> None:
61
- yesterday = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
62
- today = datetime.datetime.now().strftime("%Y-%m-%d")
63
- daily_papers = [
64
- {
65
- "date": yesterday,
66
- "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={yesterday}").json(),
67
- },
68
- {
69
- "date": today,
70
- "papers": requests.get(f"https://huggingface.co/api/daily_papers?date={today}").json(),
71
- },
72
- ]
73
-
74
- self.repo.git_pull()
75
- df = pd.read_csv(self.csv_path, dtype=str).fillna("")
76
- rows = [row for _, row in df.iterrows()]
77
- arxiv_ids = {row.arxiv_id for row in rows}
78
-
79
- for d in daily_papers:
80
- date = d["date"]
81
- papers = d["papers"]
82
- for paper in papers:
83
- arxiv_id = paper["paper"]["id"]
84
- if arxiv_id in arxiv_ids:
85
- continue
86
- try:
87
- github = find_github_links(paper["paper"]["summary"])
88
- except RuntimeError as e:
89
- print(e)
90
- continue
91
- rows.append(
92
- pd.Series(
93
- {
94
- "date": date,
95
- "arxiv_id": arxiv_id,
96
- "github": github,
97
- }
98
- )
99
- )
100
- df = pd.DataFrame(rows).reset_index(drop=True)
101
- df.to_csv(self.csv_path, index=False)
102
-
103
- def push(self) -> None:
104
- self.repo.push_to_hub()
105
-
106
-
107
- class UpdateScheduler:
108
- def __init__(self, space_id: str, cron_hour: str, cron_minute: str, cron_second: str = "0"):
109
- self.space_restarter = SpaceRestarter(space_id=space_id)
110
- self.repo_updater = RepoUpdater(repo_id=space_id, repo_type="space")
111
-
112
- self.scheduler = BackgroundScheduler()
113
- self.scheduler.add_job(
114
- func=self._update,
115
- trigger="cron",
116
- hour=cron_hour,
117
- minute=cron_minute,
118
- second=cron_second,
119
- timezone="UTC",
120
- )
121
-
122
- def _update(self) -> None:
123
- self.repo_updater.update()
124
- if self.repo_updater.repo.is_repo_clean():
125
- self.space_restarter.restart()
126
- else:
127
- self.repo_updater.push()
128
-
129
- def start(self) -> None:
130
- self.scheduler.start()