Commit f9949bb by prashant
Parent(s): fb4cce0

lexcial update

Files changed:
- appStore/keyword_search.py   +21 -19
- utils/lexical_search.py      +88 -44
- utils/sdg_classifier.py      +2  -2
- utils/streamlitcheck.py      +19 -0
appStore/keyword_search.py
CHANGED
@@ -47,12 +47,9 @@ def app():
     else:
         keywordList = None
 
-    searchtype = st.selectbox("Do you want to find exact macthes or similar
+    searchtype = st.selectbox("Do you want to find exact macthes or similar \
+                               meaning/context",
                               ['Exact Matches', 'Similar context/meaning'])
-    # if searchtype == 'Similar context/meaning':
-    #     show_answers = st.sidebar.checkbox("Show context")
-
-
 
 
     with st.container():
@@ -61,33 +58,38 @@ def app():
                         will look for these keywords in document".format(genre),
                         value="{}".format(keywordList))
             else:
-                queryList = st.text_input("Please enter here your question and we
-                                for an answer in the document
-
-                                we will look for similar
-                                in the document.",
+                queryList = st.text_input("Please enter here your question and we \
+                                will look for an answer in the document\
+                                OR enter the keyword you are looking \
+                                for and we will we will look for similar\
+                                context in the document.",
                                 placeholder="Enter keyword here")
 
             if st.button("Find them"):
 
                 if queryList == "":
-                    st.info("🤔 No keyword provided, if you dont have any,
+                    st.info("🤔 No keyword provided, if you dont have any, \
+                            please try example sets from sidebar!")
                     logging.warning("Terminated as no keyword provided")
                 else:
                     if 'filepath' in st.session_state:
 
+
                         if searchtype == 'Exact Matches':
-
+                            allDocuments = runLexicalPreprocessingPipeline(
+                                                st.session_state['filepath'],
+                                                st.session_state['filename'])
                             logging.info("performing lexical search")
-                            with st.spinner("Performing Exact matching search
+                            with st.spinner("Performing Exact matching search \
+                                            (Lexical search) for you"):
                                 st.markdown("##### Top few lexical search (TFIDF) hits #####")
-                                lexical_search(queryList,
+                                lexical_search(queryList,allDocuments['documents'])
                         else:
-
-                            paraList = runSemanticPreprocessingPipeline()
-                            logging.info("starting semantic search")
-                            with st.spinner("Performing Similar/Contextual search"):
-
+                            pass
+                            # paraList = runSemanticPreprocessingPipeline()
+                            # logging.info("starting semantic search")
+                            # with st.spinner("Performing Similar/Contextual search"):
+                            #     semantic_search(queryList,paraList)
 
                     else:
                         st.info("🤔 No document found, please try to upload it at the sidebar!")
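For orientation, the exact-match branch above now reduces to the call sequence sketched below. This is a minimal sketch, not code from the commit: the file path, file name and query string are illustrative placeholders, and in the app they come from st.session_state as shown in the diff.

# Minimal sketch of the new exact-match flow (paths and query are placeholders).
from utils.lexical_search import runLexicalPreprocessingPipeline, lexical_search

file_path = "sample/sample.pdf"   # in the app: st.session_state['filepath']
file_name = "sample.pdf"          # in the app: st.session_state['filename']

# Preprocess the uploaded file; the pipeline output is a dictionary and the
# Haystack documents are available under the 'documents' key.
allDocuments = runLexicalPreprocessingPipeline(file_path, file_name)

# Run the TFIDF-based exact-match (lexical) search for the user's keywords.
lexical_search("climate adaptation", allDocuments['documents'])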
utils/lexical_search.py
CHANGED
@@ -1,20 +1,67 @@
 from haystack.nodes import TfidfRetriever
-from haystack.nodes.base import BaseComponent
 from haystack.document_stores import InMemoryDocumentStore
-import configparser
 import spacy
 import re
 from spacy.matcher import Matcher
+from termcolor import colored
 import streamlit as st
 from markdown import markdown
 from annotated_text import annotation
 from haystack.schema import Document
 from typing import List, Text
 from utils.preprocessing import processingpipeline
-from
+from utils.streamlitcheck import check_streamlit
+import configparser
+import logging
 
+try:
+    import streamlit as st
+except ImportError:
+    logging.info("Streamlit not installed")
 config = configparser.ConfigParser()
-
+try:
+    config.read_file(open('paramconfig.cfg'))
+except Exception:
+    logging.info("paramconfig file not found")
+    st.info("Please place the paramconfig file in the same directory as app.py")
+
+
+def runLexicalPreprocessingPipeline(file_path, file_name)->List[Document]:
+    """
+    creates the pipeline and runs the preprocessing pipeline,
+    the params for pipeline are fetched from paramconfig
+
+    Params
+    ------------
+
+    file_name: filename, in case of streamlit application use
+    st.session_state['filename']
+    file_path: filepath, in case of streamlit application use
+    st.session_state['filepath']
+
+    Return
+    --------------
+    List[Document]: When preprocessing pipeline is run, the output dictionary
+    has four objects. For the lexicaal search using TFIDFRetriever we
+    need to use the List of Haystack Document, which can be fetched by
+    key = 'documents' on output.
+
+    """
+
+    lexical_processing_pipeline = processingpipeline()
+    split_by = config.get('lexical_search','SPLIT_BY')
+    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
+    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
+
+    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
+                            params= {"FileConverter": {"file_path": file_path, \
+                                        "file_name": file_name},
+                                    "UdfPreProcessor": {"removePunc": False, \
+                                        "split_by": split_by, \
+                                        "split_length":split_length,\
+                                        "split_overlap": split_overlap}})
+
+    return output_lexical_pre
 
 
 def tokenize_lexical_query(query:str)-> List[str]:
@@ -100,61 +147,56 @@ def runRegexMatcher(token_list:List[str], document:Text):
 
     return matches, document
 
-def
+def spacyAnnotator(matches: List[List[int]], document:spacy.token.doc.Doc):
     """
+    This is spacy Annotator and needs spacy.doc
     Annotates the text in the document defined by list of [start index, end index]
     Example: "How are you today", if document type is text, matches = [[0,3]]
     will give answer = "How", however in case we used the spacy matcher then the
     matches = [[0,3]] will give answer = "How are you". However if spacy is used
     to find "How" then the matches = [[0,1]] for the string defined above.
 
+    Params
+    -----------
+    matches: As mentioned its list of list. Example [[0,1],[10,13]]
+    document: document which needs to be indexed.
+
+
+    Return
+    --------
+    will send the output to either app front end using streamlit or
+    write directly to output screen.
+
     """
     start = 0
     annotated_text = ""
     for match in matches:
         start_idx = match[0]
         end_idx = match[1]
-
-
-
+
+        if check_streamlit():
+            annotated_text = (annotated_text + document[start:start_idx].text
+                                + str(annotation(body=document[start_idx:end_idx].text,
+                                label="ANSWER", background="#964448", color='#ffffff')))
+        else:
+            annotated_text = (annotated_text + document[start:start_idx].text
+                                + colored(document[start_idx:end_idx].text,
+                                "green", attrs = ['bold']))
+
+
         start = end_idx
 
     annotated_text = annotated_text + document[end_idx:].text
-
-    st.write(
-            markdown(annotated_text),
-            unsafe_allow_html=True,
-    )
 
-def runLexicalPreprocessingPipeline()->List[Document]:
-    """
-    creates the pipeline and runs the preprocessing pipeline,
-    the params for pipeline are fetched from paramconfig
-
-
-    --------------
-    List[Document]: When preprocessing pipeline is run, the output dictionary
-    has four objects. For the lexicaal search using TFIDFRetriever we
-    need to use the List of Haystack Document, which can be fetched by
-    key = 'documents' on output.
-
-    """
-    file_path = st.session_state['filepath']
-    file_name = st.session_state['filename']
-    lexical_processing_pipeline = processingpipeline()
-    split_by = config.get('lexical_search','SPLIT_BY')
-    split_length = int(config.get('lexical_search','SPLIT_LENGTH'))
-    split_overlap = int(config.get('lexical_search','SPLIT_OVERLAP'))
-
-    output_lexical_pre = lexical_processing_pipeline.run(file_paths = file_path,
-                            params= {"FileConverter": {"file_path": file_path, \
-                                        "file_name": file_name},
-                                    "UdfPreProcessor": {"removePunc": False, \
-                                        "split_by": split_by, \
-                                        "split_length":split_length,\
-                                        "split_overlap": split_overlap}})
 
-
+    if check_streamlit():
+
+        st.write(
+                markdown(annotated_text),
+                unsafe_allow_html=True,
+        )
+    else:
+        print(annotated_text)
 
 def lexical_search(query:Text,documents:List[Document]):
     """
@@ -164,7 +206,7 @@ def lexical_search(query:Text,documents:List[Document]):
     Params
     -------
     query: Keywords that need to be searche in documents.
-    documents: List
+    documents: List of Haystack documents returned by preprocessing pipeline.
 
     """
 
@@ -177,9 +219,11 @@ def lexical_search(query:Text,documents:List[Document]):
                             top_k= int(config.get('lexical_search','TOP_K')))
     query_tokens = tokenize_lexical_query(query)
     for count, result in enumerate(results):
-        # if result.content != "":
         matches, doc = runSpacyMatcher(query_tokens,result.content)
         if len(matches) != 0:
-
-
+            if check_streamlit():
+                st.write("Result {}".format(count+1))
+            else:
+                print("Results {}".format(count +1))
+            spacyAnnotator(matches, doc)
 
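The new annotation logic follows one pattern in both branches: when check_streamlit() reports a Streamlit runtime, the matched span is wrapped in annotated_text's annotation() HTML; otherwise it is highlighted in the terminal with termcolor. Below is a minimal standalone sketch of that pattern, not part of the commit, assuming a blank spaCy English pipeline and a hypothetical one-token match list.

# Standalone sketch of the Streamlit/terminal annotation pattern used in spacyAnnotator.
import spacy
from termcolor import colored
from annotated_text import annotation
from utils.streamlitcheck import check_streamlit

nlp = spacy.blank("en")            # blank pipeline: tokenization only, no model download
doc = nlp("How are you today")
matches = [[0, 1]]                 # hypothetical token-level match: the single token "How"

start = 0
annotated_text = ""
for start_idx, end_idx in matches:
    if check_streamlit():
        # Inside Streamlit: wrap the matched span in HTML produced by annotation().
        annotated_text += doc[start:start_idx].text + str(
            annotation(body=doc[start_idx:end_idx].text,
                       label="ANSWER", background="#964448", color="#ffffff"))
    else:
        # On the command line: highlight the matched span with termcolor.
        annotated_text += doc[start:start_idx].text + colored(
            doc[start_idx:end_idx].text, "green", attrs=["bold"])
    start = end_idx

annotated_text += doc[end_idx:].text
print(annotated_text)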
utils/sdg_classifier.py
CHANGED
@@ -56,7 +56,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
         the number of times it is covered/discussed/count_of_paragraphs.
 
     """
-    logging.info("
+    logging.info("Working on SDG Classification")
     threshold = float(config.get('sdg','THRESHOLD'))
 
 
@@ -83,7 +83,7 @@ def runSDGPreprocessingPipeline(file_path, file_name)->List[Document]:
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
 
-
+    Params
     ------------
 
     file_name: filename, in case of streamlit application use
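These are small touch-ups (a completed log message and a "Params" docstring header). For context, a hedged usage sketch of the two functions named in the hunk headers follows; the file names are placeholders and the call/unpacking relies only on the annotated signatures shown above, not on inspecting the rest of the module.

# Hypothetical usage sketch based on the signatures in the hunk headers above.
from utils.sdg_classifier import runSDGPreprocessingPipeline, sdg_classification

# Preprocess a document into Haystack Documents (per the annotated return type).
docs = runSDGPreprocessingPipeline("sample/sample.pdf", "sample.pdf")

# sdg_classification logs "Working on SDG Classification" and, per its signature,
# returns a (DataFrame, Series) pair with per-paragraph labels and SDG counts.
df, counts = sdg_classification(docs)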
utils/streamlitcheck.py
ADDED
@@ -0,0 +1,19 @@
+def check_streamlit():
+    """
+    Function to check whether python code is run within streamlit
+
+    Returns
+    -------
+    use_streamlit : boolean
+        True if code is run within streamlit, else False
+    """
+    try:
+        from streamlit.scriptrunner.script_run_context import get_script_run_ctx
+        if not get_script_run_ctx():
+            use_streamlit = False
+        else:
+            use_streamlit = True
+    except ModuleNotFoundError:
+        use_streamlit = False
+    return use_streamlit
+
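The helper lets shared utilities (such as spacyAnnotator above) pick an output channel at runtime. A small usage sketch, assuming the helper is importable exactly as in lexical_search.py; show_message is a hypothetical function, not part of the commit.

# Hypothetical usage of the new helper: route output depending on the runtime.
from utils.streamlitcheck import check_streamlit

def show_message(text):
    if check_streamlit():
        # Running under `streamlit run app.py`: use Streamlit widgets.
        import streamlit as st
        st.write(text)
    else:
        # Running as a plain Python script: fall back to stdout.
        print(text)

show_message("Hello from either runtime")

Note that in newer Streamlit releases the script-run context helper lives under streamlit.runtime.scriptrunner rather than streamlit.scriptrunner.script_run_context, so on those versions the import above fails and check_streamlit() falls into the except branch and reports False even inside Streamlit.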