cakiki committed
Commit aeb12b8
1 Parent(s): a74a6e9

Upload folder using huggingface_hub
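The commit message indicates the Space contents were pushed with `huggingface_hub`. A minimal sketch of how such an upload could look (repo id and folder path are placeholders, not taken from this commit, and an authenticated session via `huggingface-cli login` is assumed):

```python
# Hypothetical upload sketch; repo_id and folder_path are assumptions.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path=".",                     # local Space checkout (assumed)
    repo_id="user/imdb-search-demo",     # placeholder Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```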
.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ index/**/* filter=lfs diff=lfs merge=lfs -text
+ data/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Imdb Search Demo
+ title: IMDB search
- emoji: 💻
+ emoji: 🐠
  colorFrom: blue
- colorTo: gray
+ colorTo: blue
  sdk: gradio
- sdk_version: 3.39.0
+ sdk_version: 3.29.0
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,94 @@
+ import gradio as gr
+ from datasets import load_from_disk
+ from pyserini.search.lucene import LuceneSearcher
+
+ searcher = LuceneSearcher("index")
+ ds = load_from_disk("data")
+ NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
+ RESULTS_PER_PAGE = 5
+
+ TEXT_FIELD = "text"
+ METADATA_FIELD = "label"
+
+
+ def result_html(result, meta):
+     return (
+         f"<div style=\"color:#2a5cb3;font-weight: 500\"><u>{meta}</u></div><br>"
+         f"<div><details><summary>{result[:250]}...</summary><p>{result[250:]}</p></details></div><br><hr><br>"
+     )
+
+
+ def format_results(results):
+     return "\n".join([result_html(result, meta) for result, meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])])
+
+
+ def page_0(query):
+     hits = searcher.search(query, k=NUM_PAGES*RESULTS_PER_PAGE)
+     ix = [int(hit.docid) for hit in hits]
+     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=0, contiguous=True)  # no need to shard. split ix in batches instead. (would make sense if results was cacheable)
+     results = format_results(results)
+     return results, [ix], gr.update(visible=True)
+
+
+ def page_i(i, ix):
+     ix = ix[0]
+     results = ds.select(ix).shard(num_shards=NUM_PAGES, index=i, contiguous=True)
+     results = format_results(results)
+     return results, [ix]
+
+
+ with gr.Blocks(css="#b {min-width:15px;background:transparent;border:white;box-shadow:none;}") as demo:
+     with gr.Row():
+         gr.Markdown(value="""## <p style="text-align: center;"> IMDB search </p>""")
+     with gr.Row():
+         with gr.Column(scale=1):
+             result_list = gr.Dataframe(type="array", visible=False, col_count=1)
+         with gr.Column(scale=13):
+             query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="")
+         with gr.Column(scale=1):
+             with gr.Row(scale=1):
+                 pass
+             with gr.Row(scale=1):
+                 submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
+             with gr.Row(scale=1):
+                 pass
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             pass
+         with gr.Column(scale=13):
+             c = gr.HTML(label="Results")
+             with gr.Row(visible=False) as pagination:
+                 # left = gr.Button(value="◀", elem_id="b", visible=False).style(full_width=True)
+                 page_1 = gr.Button(value="1", elem_id="b").style(full_width=True)
+                 page_2 = gr.Button(value="2", elem_id="b").style(full_width=True)
+                 page_3 = gr.Button(value="3", elem_id="b").style(full_width=True)
+                 page_4 = gr.Button(value="4", elem_id="b").style(full_width=True)
+                 page_5 = gr.Button(value="5", elem_id="b").style(full_width=True)
+                 page_6 = gr.Button(value="6", elem_id="b").style(full_width=True)
+                 page_7 = gr.Button(value="7", elem_id="b").style(full_width=True)
+                 page_8 = gr.Button(value="8", elem_id="b").style(full_width=True)
+                 page_9 = gr.Button(value="9", elem_id="b").style(full_width=True)
+                 page_10 = gr.Button(value="10", elem_id="b").style(full_width=True)
+                 # right = gr.Button(value="▶", elem_id="b", visible=False).style(full_width=True)
+         with gr.Column(scale=1):
+             pass
+
+     query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination])
+     submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination])
+
+     with gr.Box(visible=False):
+         nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
+
+     page_1.click(fn=page_i, inputs=[nums[0], result_list], outputs=[c, result_list])
+     page_2.click(fn=page_i, inputs=[nums[1], result_list], outputs=[c, result_list])
+     page_3.click(fn=page_i, inputs=[nums[2], result_list], outputs=[c, result_list])
+     page_4.click(fn=page_i, inputs=[nums[3], result_list], outputs=[c, result_list])
+     page_5.click(fn=page_i, inputs=[nums[4], result_list], outputs=[c, result_list])
+     page_6.click(fn=page_i, inputs=[nums[5], result_list], outputs=[c, result_list])
+     page_7.click(fn=page_i, inputs=[nums[6], result_list], outputs=[c, result_list])
+     page_8.click(fn=page_i, inputs=[nums[7], result_list], outputs=[c, result_list])
+     page_9.click(fn=page_i, inputs=[nums[8], result_list], outputs=[c, result_list])
+     page_10.click(fn=page_i, inputs=[nums[9], result_list], outputs=[c, result_list])
+
+ demo.launch(enable_queue=True, debug=True)
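The paging scheme above retrieves `NUM_PAGES * RESULTS_PER_PAGE` hits once, selects the matching dataset rows by docid, and then serves page `i` as the i-th contiguous shard of that selection. A toy illustration of the same arithmetic with made-up data (standalone, not part of the commit):

```python
# Toy sketch of the app's pagination: select the hit rows, then take the
# i-th contiguous shard as one "page". The data here is fabricated.
from datasets import Dataset

ds = Dataset.from_dict({"text": [f"review {i}" for i in range(100)],
                        "label": [i % 2 for i in range(100)]})
ix = list(range(50))   # pretend these are the 10 * 5 retrieved docids
page = 3               # 0-based page index
results = ds.select(ix).shard(num_shards=10, index=page, contiguous=True)
print(results["text"])  # the 5 rows shown on the fourth page
```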
data/.gitkeep ADDED
File without changes
data/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ccad9a2cace25de89d2198d210317033e23de1a8175d1be4faafec3ceb187b20
+ size 66094864
data/dataset_info.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "builder_name": "imdb",
+   "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n",
+   "config_name": "plain_text",
+   "dataset_size": 133190302,
+   "description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.",
+   "download_checksums": {
+     "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {
+       "num_bytes": 84125825,
+       "checksum": null
+     }
+   },
+   "download_size": 84125825,
+   "features": {
+     "text": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "label": {
+       "names": [
+         "neg",
+         "pos"
+       ],
+       "_type": "ClassLabel"
+     }
+   },
+   "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/",
+   "license": "",
+   "size_in_bytes": 217316127,
+   "splits": {
+     "train": {
+       "name": "train",
+       "num_bytes": 33432823,
+       "num_examples": 25000,
+       "dataset_name": "imdb"
+     },
+     "test": {
+       "name": "test",
+       "num_bytes": 32650685,
+       "num_examples": 25000,
+       "dataset_name": "imdb"
+     },
+     "unsupervised": {
+       "name": "unsupervised",
+       "num_bytes": 67106794,
+       "num_examples": 50000,
+       "dataset_name": "imdb"
+     }
+   },
+   "task_templates": [
+     {
+       "task": "text-classification",
+       "label_column": "label"
+     }
+   ],
+   "version": {
+     "version_str": "1.0.0",
+     "description": "",
+     "major": 1,
+     "minor": 0,
+     "patch": 0
+   }
+ }
data/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "079439276ed0dd9a",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": "train+test"
+ }
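The `"_split": "train+test"` entry in state.json suggests the `data/` folder holds the concatenated IMDB train and test splits saved to disk. A sketch of how that folder may have been produced (an assumption inferred from the metadata, not part of this commit):

```python
# Possible origin of data/ (assumed): save the combined IMDB splits to disk.
from datasets import load_dataset

ds = load_dataset("imdb", split="train+test")  # 50,000 labeled reviews
ds.save_to_disk("data")  # writes data-00000-of-00001.arrow, dataset_info.json, state.json
```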
index/.gitkeep ADDED
File without changes
index/_5.fdm ADDED
Binary file (158 Bytes).
 
index/_5.fdt ADDED
Binary file (284 kB).
 
index/_5.fdx ADDED
Binary file (247 Bytes).
 
index/_5.fnm ADDED
Binary file (322 Bytes).
 
index/_5.nvd ADDED
Binary file (50.1 kB).
 
index/_5.nvm ADDED
Binary file (103 Bytes).
 
index/_5.si ADDED
Binary file (534 Bytes).
 
index/_5_Lucene90_0.doc ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c51f857a0b58e236592cfe90fe106dcf7a5d0834dfea20e0d26be498e696bd1d
+ size 7539565
index/_5_Lucene90_0.dvd ADDED
Binary file (314 kB).
 
index/_5_Lucene90_0.dvm ADDED
Binary file (171 Bytes).
 
index/_5_Lucene90_0.pos ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9e28efcc32c72172e4e2f83cf00dddd5cbfe8f9c25289d806022c3a97f14254
+ size 9575136
index/_5_Lucene90_0.tim ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:94b77d8179588eba0fec5f782efd2d42466182b5988e5463683c148ea415b087
+ size 1043047
index/_5_Lucene90_0.tip ADDED
Binary file (26.6 kB).
 
index/_5_Lucene90_0.tmd ADDED
Binary file (262 Bytes).
 
index/segments_2 ADDED
Binary file (154 Bytes).
 
index/write.lock ADDED
File without changes
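The `index/` folder is a Lucene index that app.py opens with `LuceneSearcher("index")`, and whose docids are dataset row indices (app.py calls `int(hit.docid)`). One plausible way to build such an index with Pyserini's standard Lucene indexer is sketched below; the corpus layout, paths, and flags are assumptions, not taken from this commit:

```python
# Hypothetical index-building sketch: write one JSON document per dataset row
# (docid == row index, to match int(hit.docid) in app.py), then invoke
# Pyserini's Lucene indexer. Paths and thread count are placeholders.
import json
import pathlib
import subprocess

from datasets import load_from_disk

corpus_dir = pathlib.Path("corpus")
corpus_dir.mkdir(exist_ok=True)
ds = load_from_disk("data")
with open(corpus_dir / "docs.jsonl", "w") as f:
    for i, row in enumerate(ds):
        f.write(json.dumps({"id": str(i), "contents": row["text"]}) + "\n")

subprocess.run([
    "python", "-m", "pyserini.index.lucene",
    "--collection", "JsonCollection",
    "--input", str(corpus_dir),
    "--index", "index",
    "--generator", "DefaultLuceneDocumentGenerator",
    "--threads", "4",
    "--storePositions", "--storeRaw",
], check=True)
```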
packages.txt ADDED
@@ -0,0 +1 @@
+ default-jdk
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ pyserini
+ datasets
+ faiss-cpu
+ torch
spacerini_utils/__init__.py ADDED
File without changes
spacerini_utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (161 Bytes).
 
spacerini_utils/__pycache__/index.cpython-39.pyc ADDED
Binary file (769 Bytes).
 
spacerini_utils/__pycache__/search.cpython-39.pyc ADDED
Binary file (4.41 kB).
 
spacerini_utils/index.py ADDED
@@ -0,0 +1,20 @@
+ import os
+ from typing import Any
+ from typing import Dict
+
+ from pyserini.index.lucene import IndexReader
+
+
+ def fetch_index_stats(index_path: str) -> Dict[str, Any]:
+     """
+     Fetch index statistics
+     index_path : str
+         Path to index directory
+     Returns
+     -------
+     Dictionary of index statistics
+     Dictionary Keys ==> total_terms, documents, unique_terms
+     """
+     assert os.path.exists(index_path), f"Index path {index_path} does not exist"
+     index_reader = IndexReader(index_path)
+     return index_reader.stats()
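A quick usage example of the helper above, run from the Space root where the `index/` folder lives (the printed keys follow the docstring; exact values depend on the index):

```python
# Example call against the index/ directory shipped in this repo.
from spacerini_utils.index import fetch_index_stats

stats = fetch_index_stats("index")
print(stats)  # e.g. {'total_terms': ..., 'documents': ..., 'unique_terms': ...}
```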
spacerini_utils/search.py ADDED
@@ -0,0 +1,135 @@
+ import json
+ from typing import List, Literal, Protocol, Tuple, TypedDict, Union
+
+ from pyserini.analysis import get_lucene_analyzer
+ from pyserini.index import IndexReader
+ from pyserini.search import DenseSearchResult, JLuceneSearcherResult
+ from pyserini.search.faiss.__main__ import init_query_encoder
+ from pyserini.search.faiss import FaissSearcher
+ from pyserini.search.hybrid import HybridSearcher
+ from pyserini.search.lucene import LuceneSearcher
+
+ EncoderClass = Literal["dkrr", "dpr", "tct_colbert", "ance", "sentence", "contriever", "auto"]
+
+
+ class AnalyzerArgs(TypedDict):
+     language: str
+     stemming: bool
+     stemmer: str
+     stopwords: bool
+     huggingFaceTokenizer: str
+
+
+ class SearchResult(TypedDict):
+     docid: str
+     text: str
+     score: float
+     language: str
+
+
+ class Searcher(Protocol):
+     def search(self, query: str, **kwargs) -> List[Union[DenseSearchResult, JLuceneSearcherResult]]:
+         ...
+
+
+ def init_searcher_and_reader(
+     sparse_index_path: str = None,
+     bm25_k1: float = None,
+     bm25_b: float = None,
+     analyzer_args: AnalyzerArgs = None,
+     dense_index_path: str = None,
+     encoder_name_or_path: str = None,
+     encoder_class: EncoderClass = None,
+     tokenizer_name: str = None,
+     device: str = None,
+     prefix: str = None
+ ) -> Tuple[Union[FaissSearcher, HybridSearcher, LuceneSearcher], IndexReader]:
+     """
+     Initialize and return an appropriate searcher
+
+     Parameters
+     ----------
+     sparse_index_path: str
+         Path to sparse index
+     dense_index_path: str
+         Path to dense index
+     encoder_name_or_path: str
+         Path to query encoder checkpoint or encoder name
+     encoder_class: str
+         Query encoder class to use. If None, infer from `encoder`
+     tokenizer_name: str
+         Tokenizer name or path
+     device: str
+         Device to load query encoder on.
+     prefix: str
+         Query prefix, if one exists
+
+     Returns
+     -------
+     Searcher: FaissSearcher | HybridSearcher | LuceneSearcher
+         A sparse, dense or hybrid searcher
+     """
+     reader = None
+     if sparse_index_path:
+         ssearcher = LuceneSearcher(sparse_index_path)
+         if analyzer_args:
+             analyzer = get_lucene_analyzer(**analyzer_args)
+             ssearcher.set_analyzer(analyzer)
+         if bm25_k1 and bm25_b:
+             ssearcher.set_bm25(bm25_k1, bm25_b)
+
+     if dense_index_path:
+         encoder = init_query_encoder(
+             encoder=encoder_name_or_path,
+             encoder_class=encoder_class,
+             tokenizer_name=tokenizer_name,
+             topics_name=None,
+             encoded_queries=None,
+             device=device,
+             prefix=prefix
+         )
+
+         reader = IndexReader(sparse_index_path)
+         dsearcher = FaissSearcher(dense_index_path, encoder)
+
+         if sparse_index_path:
+             hsearcher = HybridSearcher(dense_searcher=dsearcher, sparse_searcher=ssearcher)
+             return hsearcher, reader
+         else:
+             return dsearcher, reader
+
+     return ssearcher, reader
+
+
+ def _search(searcher: Searcher, reader: IndexReader, query: str, num_results: int = 10) -> List[SearchResult]:
+     """
+     Parameters
+     ----------
+     searcher: FaissSearcher | HybridSearcher | LuceneSearcher
+         A sparse, dense or hybrid searcher
+     query: str
+         Query for which to retrieve results
+     num_results: int
+         Maximum number of results to retrieve
+
+     Returns
+     -------
+     List[SearchResult]
+         Retrieved documents with docid, text and score
+     """
+     def _get_dict(r: Union[DenseSearchResult, JLuceneSearcherResult]):
+         if isinstance(r, JLuceneSearcherResult):
+             return json.loads(r.raw)
+         elif isinstance(r, DenseSearchResult):
+             # Get document from sparse index using the index reader
+             return json.loads(reader.doc(r.docid).raw())
+
+     search_results = searcher.search(query, k=num_results)
+     all_results = [
+         SearchResult(
+             docid=result["id"],
+             text=result["contents"],
+             score=search_results[idx].score
+         ) for idx, result in enumerate(map(lambda r: _get_dict(r), search_results))
+     ]
+
+     return all_results
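These helpers compose into a simple retrieval pipeline. A hedged sketch for the sparse-only case this Space uses, assuming the Lucene index stores raw JSON documents with `id` and `contents` fields; the BM25 parameters here are illustrative, not taken from the commit:

```python
# Sketch: sparse-only retrieval with the helpers above (index path is the
# repo's index/; k1 and b values are illustrative assumptions).
from spacerini_utils.search import init_searcher_and_reader, _search

searcher, reader = init_searcher_and_reader(
    sparse_index_path="index",
    bm25_k1=0.9,
    bm25_b=0.4,
)
for hit in _search(searcher, reader, "a heartwarming story", num_results=5):
    print(hit["docid"], round(hit["score"], 2), hit["text"][:80])
```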