Spaces:
Runtime error
Runtime error
kevin-pek
committed on
Commit
•
91855c2
1
Parent(s):
108bb17
sbert gradio interface
Browse files
- .gitignore +2 -0
- README.md +8 -0
- main.py +13 -12
.gitignore
CHANGED
@@ -1 +1,3 @@
|
|
1 |
venv/
|
|
|
|
|
|
1 |
venv/
|
2 |
+
__pycache__/
|
3 |
+
|
README.md
CHANGED
@@ -1,5 +1,13 @@
|
|
1 |
# Document Semantic Search
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
## Setup
|
4 |
|
5 |
[Link to venv docs](https://docs.python.org/3/library/venv.html)
|
|
|
1 |
# Document Semantic Search
|
2 |
|
3 |
+
## Run
|
4 |
+
|
5 |
+
Run the app in reload mode with the command below. The app will then reload automatically whenever the Python script changes.
|
6 |
+
|
7 |
+
```shell
|
8 |
+
$ gradio main.py
|
9 |
+
```
|
10 |
+
|
11 |
## Setup
|
12 |
|
13 |
[Link to venv docs](https://docs.python.org/3/library/venv.html)
|
main.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
-
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever
|
2 |
from haystack.document_stores import InMemoryDocumentStore
|
3 |
-
from haystack.pipelines import DocumentSearchPipeline
|
4 |
import gradio as gr
|
5 |
|
6 |
preprocessor = PreProcessor(
|
@@ -12,9 +12,11 @@ preprocessor = PreProcessor(
|
|
12 |
split_respect_sentence_boundary=True,
|
13 |
split_overlap=3
|
14 |
)
|
15 |
-
document_store = InMemoryDocumentStore()
|
|
|
16 |
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")
|
17 |
-
pipeline =
|
|
|
18 |
|
19 |
def print_answers(results):
|
20 |
fields = ["answer", "score"] # "context"
|
@@ -28,27 +30,26 @@ def print_answers(results):
|
|
28 |
return filtered_answers
|
29 |
|
30 |
def write_pdf(pdf_file):
|
31 |
-
converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
|
32 |
document = converter.convert(file_path=pdf_file.name, meta=None)[0]
|
33 |
preprocessed_docs = preprocessor.process(document)
|
34 |
document_store.write_documents(preprocessed_docs)
|
|
|
35 |
|
36 |
def predict(question, pdf_file):
|
37 |
-
print("Start processing pdf")
|
38 |
write_pdf(pdf_file)
|
39 |
-
print("Processing done.")
|
40 |
result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }})
|
41 |
answers = print_answers(result)
|
42 |
return answers
|
43 |
|
44 |
-
title = "Search"
|
45 |
interface = gr.Interface(
|
46 |
fn=predict,
|
47 |
-
inputs=[
|
|
|
|
|
|
|
48 |
outputs="text",
|
49 |
-
title=
|
50 |
-
|
51 |
-
interpretation="default",
|
52 |
theme="default" # “default", “huggingface", “dark-grass", “peach"
|
53 |
)
|
54 |
|
|
|
1 |
+
from haystack.nodes import PreProcessor, PDFToTextConverter, EmbeddingRetriever, TransformersReader
|
2 |
from haystack.document_stores import InMemoryDocumentStore
|
3 |
+
from haystack.pipelines import DocumentSearchPipeline, ExtractiveQAPipeline
|
4 |
import gradio as gr
|
5 |
|
6 |
preprocessor = PreProcessor(
|
|
|
12 |
split_respect_sentence_boundary=True,
|
13 |
split_overlap=3
|
14 |
)
|
15 |
+
document_store = InMemoryDocumentStore(embedding_dim=384)
|
16 |
+
reader = TransformersReader("sentence-transformers/all-MiniLM-L6-v2")
|
17 |
retriever = EmbeddingRetriever(document_store=document_store, embedding_model="sentence-transformers/all-MiniLM-L6-v2")
|
18 |
+
pipeline = ExtractiveQAPipeline(reader, retriever)
|
19 |
+
converter = PDFToTextConverter(remove_numeric_tables=True)
|
20 |
|
21 |
def print_answers(results):
|
22 |
fields = ["answer", "score"] # "context"
|
|
|
30 |
return filtered_answers
|
31 |
|
32 |
def write_pdf(pdf_file):
|
|
|
33 |
document = converter.convert(file_path=pdf_file.name, meta=None)[0]
|
34 |
preprocessed_docs = preprocessor.process(document)
|
35 |
document_store.write_documents(preprocessed_docs)
|
36 |
+
document_store.update_embeddings(retriever)
|
37 |
|
38 |
def predict(question, pdf_file):
|
|
|
39 |
write_pdf(pdf_file)
|
|
|
40 |
result = pipeline.run(query=question, params={"Retriever": { "top_k": 2 }})
|
41 |
answers = print_answers(result)
|
42 |
return answers
|
43 |
|
|
|
44 |
interface = gr.Interface(
|
45 |
fn=predict,
|
46 |
+
inputs=[
|
47 |
+
gr.components.Textbox(lines = 1, label="Enter your search query here..."),
|
48 |
+
gr.components.File(file_count="single", type="file", label="Upload a file here.")
|
49 |
+
],
|
50 |
outputs="text",
|
51 |
+
title="Search",
|
52 |
+
interpretation=None,
|
|
|
53 |
theme="default" # “default", “huggingface", “dark-grass", “peach"
|
54 |
)
|
55 |
|