hysts HF staff committed on
Commit
928c784
1 Parent(s): db967a1

Use apscheduler to update the index file

Browse files
Files changed (4) hide show
  1. papers.py +23 -4
  2. pyproject.toml +1 -0
  3. requirements.txt +10 -2
  4. uv.lock +28 -0
papers.py CHANGED
@@ -4,20 +4,39 @@ import operator
4
  import datasets
5
  import pandas as pd
6
  import tqdm.auto
 
7
  from huggingface_hub import HfApi
8
  from ragatouille import RAGPretrainedModel
9
 
10
  api = HfApi()
11
 
 
12
  INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
13
  api.snapshot_download(
14
- repo_id="hysts-bot-data/daily-papers-abstract-index",
15
  repo_type="dataset",
16
  local_dir=INDEX_DIR_PATH,
17
  )
18
- ABSTRACT_RETRIEVER = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
19
  # Run once to initialize the retriever
20
- ABSTRACT_RETRIEVER.search("LLM")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
 
23
  def get_df() -> pd.DataFrame:
@@ -114,7 +133,7 @@ class PaperList:
114
 
115
  # Filter by abstract
116
  if abstract_search_query:
117
- results = ABSTRACT_RETRIEVER.search(abstract_search_query, k=max_num_to_retrieve)
118
  remaining_ids = set(df["arxiv_id"])
119
  found_id_set = set()
120
  found_ids = []
 
4
  import datasets
5
  import pandas as pd
6
  import tqdm.auto
7
+ from apscheduler.schedulers.background import BackgroundScheduler
8
  from huggingface_hub import HfApi
9
  from ragatouille import RAGPretrainedModel
10
 
11
  api = HfApi()
12
 
13
+ INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
14
  INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
15
  api.snapshot_download(
16
+ repo_id=INDEX_REPO_ID,
17
  repo_type="dataset",
18
  local_dir=INDEX_DIR_PATH,
19
  )
20
+ abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
21
  # Run once to initialize the retriever
22
+ abstract_retriever.search("LLM")
23
+
24
+
25
+ def update_abstract_index() -> None:
26
+ global abstract_retriever
27
+
28
+ api.snapshot_download(
29
+ repo_id=INDEX_REPO_ID,
30
+ repo_type="dataset",
31
+ local_dir=INDEX_DIR_PATH,
32
+ )
33
+ abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
34
+ abstract_retriever.search("LLM")
35
+
36
+
37
+ scheduler = BackgroundScheduler()
38
+ scheduler.add_job(func=update_abstract_index, trigger="cron", hour="*", timezone="UTC", misfire_grace_time=3 * 60)
39
+ scheduler.start()
40
 
41
 
42
  def get_df() -> pd.DataFrame:
 
133
 
134
  # Filter by abstract
135
  if abstract_search_query:
136
+ results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
137
  remaining_ids = set(df["arxiv_id"])
138
  found_id_set = set()
139
  found_ids = []
pyproject.toml CHANGED
@@ -5,6 +5,7 @@ description = ""
5
  readme = "README.md"
6
  requires-python = ">=3.10"
7
  dependencies = [
 
8
  "datasets>=2.21.0",
9
  "gradio-calendar>=0.0.4",
10
  "gradio>=4.44.0",
 
5
  readme = "README.md"
6
  requires-python = ">=3.10"
7
  dependencies = [
8
+ "apscheduler>=3.10.4",
9
  "datasets>=2.21.0",
10
  "gradio-calendar>=0.0.4",
11
  "gradio>=4.44.0",
requirements.txt CHANGED
@@ -21,6 +21,8 @@ anyio==4.4.0
21
  # httpx
22
  # openai
23
  # starlette
 
 
24
  async-timeout==4.0.3
25
  # via
26
  # aiohttp
@@ -406,7 +408,9 @@ python-dotenv==1.0.1
406
  python-multipart==0.0.9
407
  # via gradio
408
  pytz==2024.2
409
- # via pandas
 
 
410
  pyyaml==6.0.2
411
  # via
412
  # datasets
@@ -455,7 +459,9 @@ setuptools==74.1.2
455
  shellingham==1.5.4
456
  # via typer
457
  six==1.16.0
458
- # via python-dateutil
 
 
459
  smmap==5.0.1
460
  # via gitdb
461
  sniffio==1.3.1
@@ -546,6 +552,8 @@ typing-inspect==0.9.0
546
  # llama-index-legacy
547
  tzdata==2024.1
548
  # via pandas
 
 
549
  ujson==5.10.0
550
  # via colbert-ai
551
  urllib3==2.2.2
 
21
  # httpx
22
  # openai
23
  # starlette
24
+ apscheduler==3.10.4
25
+ # via daily-papers (pyproject.toml)
26
  async-timeout==4.0.3
27
  # via
28
  # aiohttp
 
408
  python-multipart==0.0.9
409
  # via gradio
410
  pytz==2024.2
411
+ # via
412
+ # apscheduler
413
+ # pandas
414
  pyyaml==6.0.2
415
  # via
416
  # datasets
 
459
  shellingham==1.5.4
460
  # via typer
461
  six==1.16.0
462
+ # via
463
+ # apscheduler
464
+ # python-dateutil
465
  smmap==5.0.1
466
  # via gitdb
467
  sniffio==1.3.1
 
552
  # llama-index-legacy
553
  tzdata==2024.1
554
  # via pandas
555
+ tzlocal==5.2
556
+ # via apscheduler
557
  ujson==5.10.0
558
  # via colbert-ai
559
  urllib3==2.2.2
uv.lock CHANGED
@@ -139,6 +139,20 @@ wheels = [
139
  { url = "https://files.pythonhosted.org/packages/7b/a2/10639a79341f6c019dedc95bd48a4928eed9f1d1197f4c04f546fc7ae0ff/anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7", size = 86780 },
140
  ]
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  [[package]]
143
  name = "async-timeout"
144
  version = "4.0.3"
@@ -423,6 +437,7 @@ name = "daily-papers"
423
  version = "0.1.0"
424
  source = { virtual = "." }
425
  dependencies = [
 
426
  { name = "datasets" },
427
  { name = "gradio" },
428
  { name = "gradio-calendar" },
@@ -436,6 +451,7 @@ dependencies = [
436
 
437
  [package.metadata]
438
  requires-dist = [
 
439
  { name = "datasets", specifier = ">=2.21.0" },
440
  { name = "gradio", specifier = ">=4.44.0" },
441
  { name = "gradio-calendar", specifier = ">=0.0.4" },
@@ -3032,6 +3048,18 @@ wheels = [
3032
  { url = "https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", size = 345370 },
3033
  ]
3034
 
 
 
 
 
 
 
 
 
 
 
 
 
3035
  [[package]]
3036
  name = "ujson"
3037
  version = "5.10.0"
 
139
  { url = "https://files.pythonhosted.org/packages/7b/a2/10639a79341f6c019dedc95bd48a4928eed9f1d1197f4c04f546fc7ae0ff/anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7", size = 86780 },
140
  ]
141
 
142
+ [[package]]
143
+ name = "apscheduler"
144
+ version = "3.10.4"
145
+ source = { registry = "https://pypi.org/simple" }
146
+ dependencies = [
147
+ { name = "pytz" },
148
+ { name = "six" },
149
+ { name = "tzlocal" },
150
+ ]
151
+ sdist = { url = "https://files.pythonhosted.org/packages/5e/34/5dcb368cf89f93132d9a31bd3747962a9dc874480e54333b0c09fa7d56ac/APScheduler-3.10.4.tar.gz", hash = "sha256:e6df071b27d9be898e486bc7940a7be50b4af2e9da7c08f0744a96d4bd4cef4a", size = 100832 }
152
+ wheels = [
153
+ { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303 },
154
+ ]
155
+
156
  [[package]]
157
  name = "async-timeout"
158
  version = "4.0.3"
 
437
  version = "0.1.0"
438
  source = { virtual = "." }
439
  dependencies = [
440
+ { name = "apscheduler" },
441
  { name = "datasets" },
442
  { name = "gradio" },
443
  { name = "gradio-calendar" },
 
451
 
452
  [package.metadata]
453
  requires-dist = [
454
+ { name = "apscheduler", specifier = ">=3.10.4" },
455
  { name = "datasets", specifier = ">=2.21.0" },
456
  { name = "gradio", specifier = ">=4.44.0" },
457
  { name = "gradio-calendar", specifier = ">=0.0.4" },
 
3048
  { url = "https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", size = 345370 },
3049
  ]
3050
 
3051
+ [[package]]
3052
+ name = "tzlocal"
3053
+ version = "5.2"
3054
+ source = { registry = "https://pypi.org/simple" }
3055
+ dependencies = [
3056
+ { name = "tzdata", marker = "platform_system == 'Windows'" },
3057
+ ]
3058
+ sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 }
3059
+ wheels = [
3060
+ { url = "https://files.pythonhosted.org/packages/97/3f/c4c51c55ff8487f2e6d0e618dba917e3c3ee2caae6cf0fbb59c9b1876f2e/tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8", size = 17859 },
3061
+ ]
3062
+
3063
  [[package]]
3064
  name = "ujson"
3065
  version = "5.10.0"