ugmSorcero committed on
Commit
cfc1673
2 Parent(s): 42468fb c9524e4

Merge branch 'main' into feature/audio_output

Browse files
.gitignore CHANGED
@@ -128,6 +128,4 @@ dmypy.json
128
  # Pyre type checker
129
  .pyre/
130
 
131
- .vscode/
132
-
133
- data/audio/
 
128
  # Pyre type checker
129
  .pyre/
130
 
131
+ .vscode/
 
 
.streamlit/config.toml CHANGED
@@ -1,5 +1,5 @@
1
  [theme]
2
- primaryColor="#ffbf00"
3
  backgroundColor="#0e1117"
4
  secondaryBackgroundColor="#282929"
5
  textColor = "#ffffff"
 
1
  [theme]
2
+ primaryColor="#e5ab00"
3
  backgroundColor="#0e1117"
4
  secondaryBackgroundColor="#282929"
5
  textColor = "#ffffff"
app.py CHANGED
@@ -29,7 +29,7 @@ def run_demo():
29
  with navigation:
30
 
31
  selected_page = option_menu(
32
- menu_title="Navigation",
33
  options=list(pages.keys()),
34
  icons=[f[1] for f in pages.values()],
35
  menu_icon="cast",
 
29
  with navigation:
30
 
31
  selected_page = option_menu(
32
+ menu_title=None,
33
  options=list(pages.keys()),
34
  icons=[f[1] for f in pages.values()],
35
  menu_icon="cast",
core/pipelines.py CHANGED
@@ -25,6 +25,8 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
25
 
26
  - Documents that have more lexical overlap with the query are more likely to be relevant
27
  - Words that occur in fewer documents are more significant than words that occur in many documents
 
 
28
  """
29
  document_store = InMemoryDocumentStore(index=index)
30
  keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -45,10 +47,7 @@ def keyword_search(index="documents", split_word_length=100, audio_output=False)
45
  index_pipeline = Pipeline()
46
  index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
47
  index_pipeline.add_node(
48
- keyword_retriever, name="TfidfRetriever", inputs=["Preprocessor"]
49
- )
50
- index_pipeline.add_node(
51
- document_store, name="DocumentStore", inputs=["TfidfRetriever"]
52
  )
53
 
54
  if audio_output:
@@ -68,6 +67,7 @@ def dense_passage_retrieval(
68
  split_word_length=100,
69
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
70
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
 
71
  ):
72
  """
73
  **Dense Passage Retrieval Pipeline**
@@ -104,6 +104,15 @@ def dense_passage_retrieval(
104
  index_pipeline.add_node(
105
  document_store, name="DocumentStore", inputs=["DPRRetriever"]
106
  )
 
 
 
 
 
 
 
 
 
107
 
108
  return search_pipeline, index_pipeline
109
 
 
25
 
26
  - Documents that have more lexical overlap with the query are more likely to be relevant
27
  - Words that occur in fewer documents are more significant than words that occur in many documents
28
+
29
+ :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
30
  """
31
  document_store = InMemoryDocumentStore(index=index)
32
  keyword_retriever = TfidfRetriever(document_store=(document_store))
 
47
  index_pipeline = Pipeline()
48
  index_pipeline.add_node(processor, name="Preprocessor", inputs=["File"])
49
  index_pipeline.add_node(
50
+ document_store, name="DocumentStore", inputs=["Preprocessor"]
 
 
 
51
  )
52
 
53
  if audio_output:
 
67
  split_word_length=100,
68
  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
69
  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
70
+ audio_output=False
71
  ):
72
  """
73
  **Dense Passage Retrieval Pipeline**
 
104
  index_pipeline.add_node(
105
  document_store, name="DocumentStore", inputs=["DPRRetriever"]
106
  )
107
+
108
+ if audio_output:
109
+ doc2speech = DocumentToSpeech(
110
+ model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
111
+ generated_audio_dir=Path(data_path + "audio"),
112
+ )
113
+ search_pipeline.add_node(
114
+ doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
115
+ )
116
 
117
  return search_pipeline, index_pipeline
118
 
core/search_index.py CHANGED
@@ -1,4 +1,5 @@
1
  from haystack.schema import Document
 
2
  import uuid
3
 
4
 
@@ -17,8 +18,12 @@ def format_docs(documents):
17
  return db_docs, [doc.meta["id"] for doc in db_docs]
18
 
19
 
20
- def index(documents, pipeline):
21
  documents, doc_ids = format_docs(documents)
 
 
 
 
22
  pipeline.run(documents=documents)
23
  return doc_ids
24
 
@@ -36,6 +41,7 @@ def search(queries, pipeline):
36
  "text": res.content,
37
  "id": res.meta["id"],
38
  "fragment_id": res.id,
 
39
  }
40
  if not score_is_empty:
41
  match.update({"score": res.score})
 
1
  from haystack.schema import Document
2
+ from haystack.document_stores import BaseDocumentStore
3
  import uuid
4
 
5
 
 
18
  return db_docs, [doc.meta["id"] for doc in db_docs]
19
 
20
 
21
+ def index(documents, pipeline, clear_index=True):
22
  documents, doc_ids = format_docs(documents)
23
+ if clear_index:
24
+ document_stores = pipeline.get_nodes_by_class(class_type=BaseDocumentStore)
25
+ for docstore in document_stores:
26
+ docstore.delete_index(docstore.index)
27
  pipeline.run(documents=documents)
28
  return doc_ids
29
 
 
41
  "text": res.content,
42
  "id": res.meta["id"],
43
  "fragment_id": res.id,
44
+ "meta": res.meta,
45
  }
46
  if not score_is_empty:
47
  match.update({"score": res.score})
interface/components.py CHANGED
@@ -42,11 +42,15 @@ def component_select_pipeline(container):
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
 
45
 
46
 
47
  def component_show_pipeline(pipeline, pipeline_name):
48
  """Draw the pipeline"""
49
- with st.expander("Show pipeline"):
 
 
 
50
  if pipeline["doc"] is not None:
51
  st.markdown(pipeline["doc"])
52
  fig = get_pipeline_graph(pipeline[pipeline_name])
@@ -59,6 +63,8 @@ def component_show_search_result(container, results):
59
  st.markdown(f"### Match {idx+1}")
60
  st.markdown(f"**Text**: {document['text']}")
61
  st.markdown(f"**Document**: {document['id']}")
 
 
62
  if "score" in document:
63
  st.markdown(f"**Score**: {document['score']:.3f}")
64
  if "content_audio" in document:
@@ -66,36 +72,32 @@ def component_show_search_result(container, results):
66
  st.markdown("---")
67
 
68
 
69
- def component_text_input(container):
70
  """Draw the Text Input widget"""
71
  with container:
72
  texts = []
73
- doc_id = 1
74
  with st.expander("Enter documents"):
75
  while True:
76
  text = st.text_input(f"Document {doc_id}", key=doc_id)
77
  if text != "":
78
- texts.append({"text": text})
79
  doc_id += 1
80
  st.markdown("---")
81
  else:
82
  break
83
- corpus = [
84
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(texts)
85
- ]
86
- return corpus
87
 
88
 
89
- def component_article_url(container):
90
  """Draw the Article URL widget"""
91
  with container:
92
  urls = []
93
- doc_id = 1
94
  with st.expander("Enter URLs"):
95
  while True:
96
  url = st.text_input(f"URL {doc_id}", key=doc_id)
97
  if url != "":
98
- urls.append({"text": extract_text_from_url(url)})
99
  doc_id += 1
100
  st.markdown("---")
101
  else:
@@ -103,19 +105,16 @@ def component_article_url(container):
103
 
104
  for idx, doc in enumerate(urls):
105
  with st.expander(f"Preview URL {idx}"):
106
- st.write(doc)
107
 
108
- corpus = [
109
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
110
- ]
111
- return corpus
112
 
113
 
114
- def component_file_input(container):
115
  """Draw the extract text from file widget"""
116
  with container:
117
  files = []
118
- doc_id = 1
119
  with st.expander("Enter Files"):
120
  while True:
121
  file = st.file_uploader(
@@ -124,7 +123,7 @@ def component_file_input(container):
124
  if file != None:
125
  extracted_text = extract_text_from_file(file)
126
  if extracted_text != None:
127
- files.append({"text": extracted_text})
128
  doc_id += 1
129
  st.markdown("---")
130
  else:
@@ -134,9 +133,7 @@ def component_file_input(container):
134
 
135
  for idx, doc in enumerate(files):
136
  with st.expander(f"Preview File {idx}"):
137
- st.write(doc)
138
 
139
- corpus = [
140
- {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
141
- ]
142
- return corpus
 
42
  "index_pipeline": index_pipeline,
43
  "doc": pipeline_funcs[index_pipe].__doc__,
44
  }
45
+ st.session_state["doc_id"] = 0
46
 
47
 
48
  def component_show_pipeline(pipeline, pipeline_name):
49
  """Draw the pipeline"""
50
+ expander_text = "Show pipeline"
51
+ if pipeline["doc"] is not None and "BUG" in pipeline["doc"]:
52
+ expander_text += " ⚠️"
53
+ with st.expander(expander_text):
54
  if pipeline["doc"] is not None:
55
  st.markdown(pipeline["doc"])
56
  fig = get_pipeline_graph(pipeline[pipeline_name])
 
63
  st.markdown(f"### Match {idx+1}")
64
  st.markdown(f"**Text**: {document['text']}")
65
  st.markdown(f"**Document**: {document['id']}")
66
+ if "_split_id" in document["meta"]:
67
+ st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
68
  if "score" in document:
69
  st.markdown(f"**Score**: {document['score']:.3f}")
70
  if "content_audio" in document:
 
72
  st.markdown("---")
73
 
74
 
75
+ def component_text_input(container, doc_id):
76
  """Draw the Text Input widget"""
77
  with container:
78
  texts = []
 
79
  with st.expander("Enter documents"):
80
  while True:
81
  text = st.text_input(f"Document {doc_id}", key=doc_id)
82
  if text != "":
83
+ texts.append({"text": text, "doc_id": doc_id})
84
  doc_id += 1
85
  st.markdown("---")
86
  else:
87
  break
88
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in texts]
89
+ return corpus, doc_id
 
 
90
 
91
 
92
+ def component_article_url(container, doc_id):
93
  """Draw the Article URL widget"""
94
  with container:
95
  urls = []
 
96
  with st.expander("Enter URLs"):
97
  while True:
98
  url = st.text_input(f"URL {doc_id}", key=doc_id)
99
  if url != "":
100
+ urls.append({"text": extract_text_from_url(url), "doc_id": doc_id})
101
  doc_id += 1
102
  st.markdown("---")
103
  else:
 
105
 
106
  for idx, doc in enumerate(urls):
107
  with st.expander(f"Preview URL {idx}"):
108
+ st.write(doc["text"])
109
 
110
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in urls]
111
+ return corpus, doc_id
 
 
112
 
113
 
114
+ def component_file_input(container, doc_id):
115
  """Draw the extract text from file widget"""
116
  with container:
117
  files = []
 
118
  with st.expander("Enter Files"):
119
  while True:
120
  file = st.file_uploader(
 
123
  if file != None:
124
  extracted_text = extract_text_from_file(file)
125
  if extracted_text != None:
126
+ files.append({"text": extracted_text, "doc_id": doc_id})
127
  doc_id += 1
128
  st.markdown("---")
129
  else:
 
133
 
134
  for idx, doc in enumerate(files):
135
  with st.expander(f"Preview File {idx}"):
136
+ st.write(doc["text"])
137
 
138
+ corpus = [{"text": doc["text"], "id": doc["doc_id"]} for doc in files]
139
+ return corpus, doc_id
 
 
interface/config.py CHANGED
@@ -1,7 +1,11 @@
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
- session_state_variables = {"pipeline": None, "pipeline_func_parameters": []}
 
 
 
 
5
 
6
  # Define Pages for the demo
7
  pages = {
 
1
  from interface.pages import page_landing_page, page_search, page_index
2
 
3
  # Define default Session Variables over the whole session.
4
+ session_state_variables = {
5
+ "pipeline": None,
6
+ "pipeline_func_parameters": [],
7
+ "doc_id": 0,
8
+ }
9
 
10
  # Define Pages for the demo
11
  pages = {
interface/pages.py CHANGED
@@ -79,14 +79,17 @@ def page_index(container):
79
  orientation="horizontal",
80
  )
81
 
82
- corpus = input_funcs[selected_input][0](container)
 
 
 
83
 
84
  if len(corpus) > 0:
85
  index_results = None
86
  if st.button("Index"):
87
  index_results = index(
88
- corpus,
89
- st.session_state["pipeline"]["index_pipeline"],
90
  )
 
91
  if index_results:
92
  st.write(index_results)
 
79
  orientation="horizontal",
80
  )
81
 
82
+ clear_index = st.sidebar.checkbox("Clear Index", True)
83
+
84
+ doc_id = st.session_state["doc_id"]
85
+ corpus, doc_id = input_funcs[selected_input][0](container, doc_id)
86
 
87
  if len(corpus) > 0:
88
  index_results = None
89
  if st.button("Index"):
90
  index_results = index(
91
+ corpus, st.session_state["pipeline"]["index_pipeline"], clear_index
 
92
  )
93
+ st.session_state["doc_id"] = doc_id
94
  if index_results:
95
  st.write(index_results)
requirements.txt CHANGED
@@ -5,8 +5,4 @@ black==22.8.0
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
  PyPDF2==2.10.7
8
- pytesseract==0.3.10
9
- soundfile==0.10.3.post1
10
- espnet
11
- pydub==0.25.1
12
- espnet_model_zoo==0.1.7
 
5
  plotly==5.10.0
6
  newspaper3k==0.2.8
7
  PyPDF2==2.10.7
8
+ pytesseract==0.3.10