Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Use apscheduler to update the index file
Browse files
- papers.py +23 -4
- pyproject.toml +1 -0
- requirements.txt +10 -2
- uv.lock +28 -0
papers.py
CHANGED
@@ -4,20 +4,39 @@ import operator
|
|
4 |
import datasets
|
5 |
import pandas as pd
|
6 |
import tqdm.auto
|
|
|
7 |
from huggingface_hub import HfApi
|
8 |
from ragatouille import RAGPretrainedModel
|
9 |
|
10 |
api = HfApi()
|
11 |
|
|
|
12 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
13 |
api.snapshot_download(
|
14 |
-
repo_id=
|
15 |
repo_type="dataset",
|
16 |
local_dir=INDEX_DIR_PATH,
|
17 |
)
|
18 |
-
|
19 |
# Run once to initialize the retriever
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
|
23 |
def get_df() -> pd.DataFrame:
|
@@ -114,7 +133,7 @@ class PaperList:
|
|
114 |
|
115 |
# Filter by abstract
|
116 |
if abstract_search_query:
|
117 |
-
results =
|
118 |
remaining_ids = set(df["arxiv_id"])
|
119 |
found_id_set = set()
|
120 |
found_ids = []
|
|
|
4 |
import datasets
|
5 |
import pandas as pd
|
6 |
import tqdm.auto
|
7 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
8 |
from huggingface_hub import HfApi
|
9 |
from ragatouille import RAGPretrainedModel
|
10 |
|
11 |
api = HfApi()
|
12 |
|
13 |
+
INDEX_REPO_ID = "hysts-bot-data/daily-papers-abstract-index"
|
14 |
INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
|
15 |
api.snapshot_download(
|
16 |
+
repo_id=INDEX_REPO_ID,
|
17 |
repo_type="dataset",
|
18 |
local_dir=INDEX_DIR_PATH,
|
19 |
)
|
20 |
+
abstract_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
|
21 |
# Run once to initialize the retriever
|
22 |
+
abstract_retriever.search("LLM")
|
23 |
+
|
24 |
+
|
25 |
+
def update_abstract_index() -> None:
    """Re-download the abstract index and swap in a freshly loaded retriever.

    Registered as an hourly cron job on the background scheduler, so it runs
    on a scheduler thread while search requests keep reading the module-level
    ``abstract_retriever``.

    The new retriever is loaded and warmed up in a local variable and only
    then published to the global. If the download, the load, or the warm-up
    raises, the previously published retriever stays in place and the
    exception propagates to the scheduler.
    """
    global abstract_retriever

    # Refresh the on-disk copy of the index from the dataset repository.
    api.snapshot_download(
        repo_id=INDEX_REPO_ID,
        repo_type="dataset",
        local_dir=INDEX_DIR_PATH,
    )

    # Build the replacement off to the side so concurrent searches never see
    # a retriever that has not been initialized yet.
    fresh_retriever = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
    # Run once to initialize the retriever
    fresh_retriever.search("LLM")

    # Publish only after the warm-up succeeded; a single name rebinding keeps
    # readers on either the old or the fully initialized new retriever.
    abstract_retriever = fresh_retriever
|
35 |
+
|
36 |
+
|
37 |
+
scheduler = BackgroundScheduler()
|
38 |
+
scheduler.add_job(func=update_abstract_index, trigger="cron", hour="*", timezone="UTC", misfire_grace_time=3 * 60)
|
39 |
+
scheduler.start()
|
40 |
|
41 |
|
42 |
def get_df() -> pd.DataFrame:
|
|
|
133 |
|
134 |
# Filter by abstract
|
135 |
if abstract_search_query:
|
136 |
+
results = abstract_retriever.search(abstract_search_query, k=max_num_to_retrieve)
|
137 |
remaining_ids = set(df["arxiv_id"])
|
138 |
found_id_set = set()
|
139 |
found_ids = []
|
pyproject.toml
CHANGED
@@ -5,6 +5,7 @@ description = ""
|
|
5 |
readme = "README.md"
|
6 |
requires-python = ">=3.10"
|
7 |
dependencies = [
|
|
|
8 |
"datasets>=2.21.0",
|
9 |
"gradio-calendar>=0.0.4",
|
10 |
"gradio>=4.44.0",
|
|
|
5 |
readme = "README.md"
|
6 |
requires-python = ">=3.10"
|
7 |
dependencies = [
|
8 |
+
"apscheduler>=3.10.4",
|
9 |
"datasets>=2.21.0",
|
10 |
"gradio-calendar>=0.0.4",
|
11 |
"gradio>=4.44.0",
|
requirements.txt
CHANGED
@@ -21,6 +21,8 @@ anyio==4.4.0
|
|
21 |
# httpx
|
22 |
# openai
|
23 |
# starlette
|
|
|
|
|
24 |
async-timeout==4.0.3
|
25 |
# via
|
26 |
# aiohttp
|
@@ -406,7 +408,9 @@ python-dotenv==1.0.1
|
|
406 |
python-multipart==0.0.9
|
407 |
# via gradio
|
408 |
pytz==2024.2
|
409 |
-
# via
|
|
|
|
|
410 |
pyyaml==6.0.2
|
411 |
# via
|
412 |
# datasets
|
@@ -455,7 +459,9 @@ setuptools==74.1.2
|
|
455 |
shellingham==1.5.4
|
456 |
# via typer
|
457 |
six==1.16.0
|
458 |
-
# via
|
|
|
|
|
459 |
smmap==5.0.1
|
460 |
# via gitdb
|
461 |
sniffio==1.3.1
|
@@ -546,6 +552,8 @@ typing-inspect==0.9.0
|
|
546 |
# llama-index-legacy
|
547 |
tzdata==2024.1
|
548 |
# via pandas
|
|
|
|
|
549 |
ujson==5.10.0
|
550 |
# via colbert-ai
|
551 |
urllib3==2.2.2
|
|
|
21 |
# httpx
|
22 |
# openai
|
23 |
# starlette
|
24 |
+
apscheduler==3.10.4
|
25 |
+
# via daily-papers (pyproject.toml)
|
26 |
async-timeout==4.0.3
|
27 |
# via
|
28 |
# aiohttp
|
|
|
408 |
python-multipart==0.0.9
|
409 |
# via gradio
|
410 |
pytz==2024.2
|
411 |
+
# via
|
412 |
+
# apscheduler
|
413 |
+
# pandas
|
414 |
pyyaml==6.0.2
|
415 |
# via
|
416 |
# datasets
|
|
|
459 |
shellingham==1.5.4
|
460 |
# via typer
|
461 |
six==1.16.0
|
462 |
+
# via
|
463 |
+
# apscheduler
|
464 |
+
# python-dateutil
|
465 |
smmap==5.0.1
|
466 |
# via gitdb
|
467 |
sniffio==1.3.1
|
|
|
552 |
# llama-index-legacy
|
553 |
tzdata==2024.1
|
554 |
# via pandas
|
555 |
+
tzlocal==5.2
|
556 |
+
# via apscheduler
|
557 |
ujson==5.10.0
|
558 |
# via colbert-ai
|
559 |
urllib3==2.2.2
|
uv.lock
CHANGED
@@ -139,6 +139,20 @@ wheels = [
|
|
139 |
{ url = "https://files.pythonhosted.org/packages/7b/a2/10639a79341f6c019dedc95bd48a4928eed9f1d1197f4c04f546fc7ae0ff/anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7", size = 86780 },
|
140 |
]
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
[[package]]
|
143 |
name = "async-timeout"
|
144 |
version = "4.0.3"
|
@@ -423,6 +437,7 @@ name = "daily-papers"
|
|
423 |
version = "0.1.0"
|
424 |
source = { virtual = "." }
|
425 |
dependencies = [
|
|
|
426 |
{ name = "datasets" },
|
427 |
{ name = "gradio" },
|
428 |
{ name = "gradio-calendar" },
|
@@ -436,6 +451,7 @@ dependencies = [
|
|
436 |
|
437 |
[package.metadata]
|
438 |
requires-dist = [
|
|
|
439 |
{ name = "datasets", specifier = ">=2.21.0" },
|
440 |
{ name = "gradio", specifier = ">=4.44.0" },
|
441 |
{ name = "gradio-calendar", specifier = ">=0.0.4" },
|
@@ -3032,6 +3048,18 @@ wheels = [
|
|
3032 |
{ url = "https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", size = 345370 },
|
3033 |
]
|
3034 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3035 |
[[package]]
|
3036 |
name = "ujson"
|
3037 |
version = "5.10.0"
|
|
|
139 |
{ url = "https://files.pythonhosted.org/packages/7b/a2/10639a79341f6c019dedc95bd48a4928eed9f1d1197f4c04f546fc7ae0ff/anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7", size = 86780 },
|
140 |
]
|
141 |
|
142 |
+
[[package]]
|
143 |
+
name = "apscheduler"
|
144 |
+
version = "3.10.4"
|
145 |
+
source = { registry = "https://pypi.org/simple" }
|
146 |
+
dependencies = [
|
147 |
+
{ name = "pytz" },
|
148 |
+
{ name = "six" },
|
149 |
+
{ name = "tzlocal" },
|
150 |
+
]
|
151 |
+
sdist = { url = "https://files.pythonhosted.org/packages/5e/34/5dcb368cf89f93132d9a31bd3747962a9dc874480e54333b0c09fa7d56ac/APScheduler-3.10.4.tar.gz", hash = "sha256:e6df071b27d9be898e486bc7940a7be50b4af2e9da7c08f0744a96d4bd4cef4a", size = 100832 }
|
152 |
+
wheels = [
|
153 |
+
{ url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303 },
|
154 |
+
]
|
155 |
+
|
156 |
[[package]]
|
157 |
name = "async-timeout"
|
158 |
version = "4.0.3"
|
|
|
437 |
version = "0.1.0"
|
438 |
source = { virtual = "." }
|
439 |
dependencies = [
|
440 |
+
{ name = "apscheduler" },
|
441 |
{ name = "datasets" },
|
442 |
{ name = "gradio" },
|
443 |
{ name = "gradio-calendar" },
|
|
|
451 |
|
452 |
[package.metadata]
|
453 |
requires-dist = [
|
454 |
+
{ name = "apscheduler", specifier = ">=3.10.4" },
|
455 |
{ name = "datasets", specifier = ">=2.21.0" },
|
456 |
{ name = "gradio", specifier = ">=4.44.0" },
|
457 |
{ name = "gradio-calendar", specifier = ">=0.0.4" },
|
|
|
3048 |
{ url = "https://files.pythonhosted.org/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252", size = 345370 },
|
3049 |
]
|
3050 |
|
3051 |
+
[[package]]
|
3052 |
+
name = "tzlocal"
|
3053 |
+
version = "5.2"
|
3054 |
+
source = { registry = "https://pypi.org/simple" }
|
3055 |
+
dependencies = [
|
3056 |
+
{ name = "tzdata", marker = "platform_system == 'Windows'" },
|
3057 |
+
]
|
3058 |
+
sdist = { url = "https://files.pythonhosted.org/packages/04/d3/c19d65ae67636fe63953b20c2e4a8ced4497ea232c43ff8d01db16de8dc0/tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e", size = 30201 }
|
3059 |
+
wheels = [
|
3060 |
+
{ url = "https://files.pythonhosted.org/packages/97/3f/c4c51c55ff8487f2e6d0e618dba917e3c3ee2caae6cf0fbb59c9b1876f2e/tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8", size = 17859 },
|
3061 |
+
]
|
3062 |
+
|
3063 |
[[package]]
|
3064 |
name = "ujson"
|
3065 |
version = "5.10.0"
|