hysts HF staff committed on
Commit
d940698
1 Parent(s): 8e3b16f
Files changed (4) hide show
  1. .gitignore +162 -0
  2. app.py +28 -3
  3. papers.py +32 -0
  4. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .ragatouille/
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/#use-with-ide
112
+ .pdm.toml
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ #.idea/
app.py CHANGED
@@ -8,6 +8,14 @@ from papers import PaperList, get_df
8
 
9
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
10
 
 
 
 
 
 
 
 
 
11
  paper_list = PaperList(get_df())
12
 
13
 
@@ -18,10 +26,25 @@ def update_num_papers(df: pd.DataFrame) -> str:
18
  with gr.Blocks(css="style.css") as demo:
19
  gr.Markdown(DESCRIPTION)
20
  with gr.Group():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  with gr.Row():
22
  start_date = Calendar(label="Start date", type="datetime", value="2023-05-05")
23
  end_date = Calendar(label="End date", type="datetime")
24
- search_title = gr.Textbox(label="Search title")
25
 
26
  num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
27
  df = gr.Dataframe(
@@ -35,10 +58,12 @@ with gr.Blocks(css="style.css") as demo:
35
  wrap=True,
36
  )
37
 
 
 
38
  gr.on(
39
- triggers=[start_date.change, end_date.change, search_title.submit],
40
  fn=paper_list.search,
41
- inputs=[start_date, end_date, search_title],
42
  outputs=df,
43
  api_name=False,
44
  ).then(
 
8
 
9
  DESCRIPTION = "# [Daily Papers](https://huggingface.co/papers)"
10
 
11
+ FOOT_NOTE = """\
12
+ Related useful Spaces:
13
+ - [Semantic Scholar Paper Recommender](https://huggingface.co/spaces/librarian-bots/recommend_similar_papers) by [davanstrien](https://huggingface.co/davanstrien)
14
+ - [ArXiv CS RAG](https://huggingface.co/spaces/bishmoy/Arxiv-CS-RAG) by [bishmoy](https://huggingface.co/bishmoy)
15
+ - [Paper Q&A](https://huggingface.co/spaces/chansung/paper_qa) by [chansung](https://huggingface.co/chansung)
16
+ """
17
+
18
+
19
  paper_list = PaperList(get_df())
20
 
21
 
 
26
  with gr.Blocks(css="style.css") as demo:
27
  gr.Markdown(DESCRIPTION)
28
  with gr.Group():
29
+ search_title = gr.Textbox(label="Search title")
30
+ with gr.Row():
31
+ with gr.Column(scale=4):
32
+ search_abstract = gr.Textbox(
33
+ label="Search abstract",
34
+ info="The result may not be accurate as the abstract does not contain all the information.",
35
+ )
36
+ with gr.Column(scale=1):
37
+ max_num_to_retrieve = gr.Slider(
38
+ label="Max number to retrieve",
39
+ info="This is used only for search on abstracts.",
40
+ minimum=1,
41
+ maximum=len(paper_list.df_raw),
42
+ step=1,
43
+ value=100,
44
+ )
45
  with gr.Row():
46
  start_date = Calendar(label="Start date", type="datetime", value="2023-05-05")
47
  end_date = Calendar(label="End date", type="datetime")
 
48
 
49
  num_papers = gr.Textbox(label="Number of papers", value=update_num_papers(paper_list.df_raw), interactive=False)
50
  df = gr.Dataframe(
 
58
  wrap=True,
59
  )
60
 
61
+ gr.Markdown(FOOT_NOTE)
62
+
63
  gr.on(
64
+ triggers=[start_date.change, end_date.change, search_title.submit, search_abstract.submit],
65
  fn=paper_list.search,
66
+ inputs=[start_date, end_date, search_title, search_abstract, max_num_to_retrieve],
67
  outputs=df,
68
  api_name=False,
69
  ).then(
papers.py CHANGED
@@ -5,6 +5,20 @@ import operator
5
  import datasets
6
  import pandas as pd
7
  import tqdm.auto
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  @dataclasses.dataclass(frozen=True)
@@ -92,6 +106,8 @@ class PaperList:
92
  start_date: datetime.datetime,
93
  end_date: datetime.datetime,
94
  title_search_query: str,
 
 
95
  ) -> pd.DataFrame:
96
  df = self.df_raw.copy()
97
  df["date"] = pd.to_datetime(df["date"])
@@ -103,5 +119,21 @@ class PaperList:
103
  # Filter by title
104
  df = df[df["title"].str.contains(title_search_query, case=False)]
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  df_prettified = self._prettifier(df).loc[:, self.column_names]
107
  return df_prettified
 
5
  import datasets
6
  import pandas as pd
7
  import tqdm.auto
8
+ from huggingface_hub import HfApi
9
+ from ragatouille import RAGPretrainedModel
10
+
11
+ api = HfApi()
12
+
13
+ INDEX_DIR_PATH = ".ragatouille/colbert/indexes/daily-papers-abstract-index/"
14
+ api.snapshot_download(
15
+ repo_id="hysts-bot-data/daily-papers-abstract-index",
16
+ repo_type="dataset",
17
+ local_dir=INDEX_DIR_PATH,
18
+ )
19
+ ABSTRACT_RETRIEVER = RAGPretrainedModel.from_index(INDEX_DIR_PATH)
20
+ # Run once to initialize the retriever
21
+ ABSTRACT_RETRIEVER.search("LLM")
22
 
23
 
24
  @dataclasses.dataclass(frozen=True)
 
106
  start_date: datetime.datetime,
107
  end_date: datetime.datetime,
108
  title_search_query: str,
109
+ abstract_search_query: str,
110
+ max_num_to_retrieve: int,
111
  ) -> pd.DataFrame:
112
  df = self.df_raw.copy()
113
  df["date"] = pd.to_datetime(df["date"])
 
119
  # Filter by title
120
  df = df[df["title"].str.contains(title_search_query, case=False)]
121
 
122
+ # Filter by abstract
123
+ if abstract_search_query:
124
+ results = ABSTRACT_RETRIEVER.search(abstract_search_query, k=max_num_to_retrieve)
125
+ remaining_ids = set(df["arxiv_id"])
126
+ found_id_set = set()
127
+ found_ids = []
128
+ for x in results:
129
+ arxiv_id = x["document_id"]
130
+ if arxiv_id not in remaining_ids:
131
+ continue
132
+ if arxiv_id in found_id_set:
133
+ continue
134
+ found_id_set.add(arxiv_id)
135
+ found_ids.append(arxiv_id)
136
+ df = df[df["arxiv_id"].isin(found_ids)].set_index("arxiv_id").reindex(index=found_ids).reset_index()
137
+
138
  df_prettified = self._prettifier(df).loc[:, self.column_names]
139
  return df_prettified
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  datasets==2.18.0
2
- gradio==4.21.0
3
  gradio_calendar==0.0.4
4
  huggingface_hub==0.21.4
5
  pandas==2.2.0
 
6
  tqdm==4.66.2
 
1
  datasets==2.18.0
2
+ #gradio==4.21.0
3
  gradio_calendar==0.0.4
4
  huggingface_hub==0.21.4
5
  pandas==2.2.0
6
+ ragatouille==0.0.7.post10
7
  tqdm==4.66.2