Commit 49a314a by prashant
Parent(s): f47e7d4

ver0.2 udfpreprocess update

Files changed:
- udfPreprocess/cleaning.py +16 -4
- udfPreprocess/docPreprocessing.py +6 -6
- udfPreprocess/paramconfig.cfg +12 -0
- udfPreprocess/sdg.py +57 -0
- udfPreprocess/search.py +145 -0
- udfPreprocess/uploadAndExample.py +48 -0
udfPreprocess/cleaning.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import pandas as pd
 import numpy as np
 import string
@@ -10,7 +11,7 @@ import streamlit as st
 from haystack.nodes import PreProcessor
 
 '''basic cleaning - suitable for transformer models'''
-def basic(s):
+def basic(s,SDG = False):
     """
     :param s: string to be processed
     :return: processed string: see comments in the source code for more info
@@ -23,6 +24,15 @@ def basic(s):
     # Remove URLs
     s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
     s = re.sub(r"http\S+", " ", s)
+    if SDG == True:
+        s = s.lower()
+        translator = str.maketrans(' ', ' ', string.punctuation)
+        s = s.translate(translator)
+        s = re.sub('\n', ' ', s)
+        s = re.sub("\'", " ", s)
+        s = re.sub(r'\d+', ' ', s)
+        s = re.sub(r'\W+', ' ', s)
+
     # Remove new line characters
     #s = re.sub('\n', ' ', s)
 
@@ -59,9 +69,10 @@ def preprocessingForSDG(document):
     for i in document:
         docs_processed = preprocessor.process([i])
         for item in docs_processed:
-            item.content = basic(item.content)
+            item.content = basic(item.content, SDG = True)
 
-    st.
+    with st.spinner("👑 document being splitted into paragraphs"):
+        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
@@ -93,7 +104,8 @@ def preprocessing(document):
     for item in docs_processed:
         item.content = basic(item.content)
 
-    st.
+    with st.spinner("👑 document being splitted into paragraphs"):
+        logging.info("document has been splitted to {} paragraphs".format(len(docs_processed)))
 
     # create dataframe of text and list of all text
     df = pd.DataFrame(docs_processed)
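
For orientation, a minimal standalone sketch of what the new SDG branch in basic() does to a paragraph. It re-implements the added lines for illustration only (the helper name basic_sdg and the sample string are not part of the commit):

    import re
    import string

    def basic_sdg(s):
        # Mirrors the SDG=True branch added above: strip URLs, lowercase,
        # drop punctuation, newlines, apostrophes, digits and non-word characters.
        s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
        s = re.sub(r"http\S+", " ", s)
        s = s.lower()
        s = s.translate(str.maketrans(' ', ' ', string.punctuation))
        s = re.sub('\n', ' ', s)
        s = re.sub("\'", " ", s)
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)
        return s

    print(basic_sdg("Goal 13: Take urgent action!\nSee https://sdgs.un.org/goals"))
    # prints roughly: 'goal take urgent action see '

With this change, preprocessingForSDG() applies the aggressive SDG cleaning, while preprocessing() keeps the lighter default cleaning used for search.
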
udfPreprocess/docPreprocessing.py
CHANGED
@@ -65,11 +65,11 @@ def load_document(
     This can happen whith certain pdf types.'''
     for i in documents:
         if i.content == "":
-            st.
-
-
-
-
-
+            with st.spinner("using pdfplumber"):
+                text = []
+                with pdfplumber.open(file_path) as pdf:
+                    for page in pdf.pages:
+                        text.append(page.extract_text())
+                i.content = ' '.join([page for page in text])
 
     return documents
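
The fallback above only runs when the converter returns an empty i.content; below is a minimal standalone sketch of the same pdfplumber extraction (the function name and the 'sample/report.pdf' path are illustrative, not part of the commit):

    import pdfplumber

    def extract_text_with_pdfplumber(file_path):
        # Collect every page's text and join it into one string, as the new
        # branch in load_document does. The `or ""` guard is an extra safety
        # net for pages where extract_text() returns None; the committed code
        # joins the raw values.
        pages = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                pages.append(page.extract_text() or "")
        return ' '.join(pages)

    # text = extract_text_with_pdfplumber('sample/report.pdf')  # hypothetical path
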
udfPreprocess/paramconfig.cfg
ADDED
@@ -0,0 +1,12 @@
+[lexical_search]
+TOP_K = 10
+THRESHOLD = 0.1
+
+[semantic_search]
+TOP_K = 10
+MAX_SEQ_LENGTH = 64
+MODEL_NAME = msmarco-distilbert-cos-v5
+THRESHOLD = 0.1
+
+[sdg]
+THRESHOLD = 0.85
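
These values are read with configparser in sdg.py and search.py below; a minimal sketch of the same pattern, assuming the script runs from the repository root so the relative path resolves:

    import configparser

    config = configparser.ConfigParser()
    config.read_file(open('udfPreprocess/paramconfig.cfg'))

    # Values come back as strings and are cast where needed.
    lexical_top_k = int(config.get('lexical_search', 'TOP_K'))      # 10
    model_name = config.get('semantic_search', 'MODEL_NAME')        # msmarco-distilbert-cos-v5
    sdg_threshold = float(config.get('sdg', 'THRESHOLD'))           # 0.85
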
udfPreprocess/sdg.py
ADDED
@@ -0,0 +1,57 @@
+import glob, os, sys;
+sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+
+import tempfile
+import sqlite3
+import logging
+logger = logging.getLogger(__name__)
+import configparser
+
+@st.cache(allow_output_mutation=True)
+def load_sdgClassifier():
+    classifier = pipeline("text-classification", model= "jonas/sdg_classifier_osdg")
+    logging.info("Loading classifier")
+    return classifier
+
+def sdg_classification(par_list):
+    logging.info("running SDG classifiication")
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    threshold = float(config.get('sdg','THRESHOLD'))
+
+
+    classifier = load_sdgClassifier()
+    labels = classifier(par_list)
+
+    labels_ = [(l['label'], l['score']) for l in labels]
+    # df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+    df2 = DataFrame(labels_, columns=["SDG", "Relevancy"])
+
+    df2['text'] = par_list
+    df2 = df2.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
+    df2.index += 1
+    df2 = df2[df2['Relevancy'] > threshold]
+    x = df2['SDG'].value_counts()
+    df3 = df2.copy()
+    df3 = df3.drop(['Relevancy'], axis=1)
+
+
+    return df3, x
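
A hedged usage sketch for the new module: classify a few paragraphs and inspect what sdg_classification() returns (the sample paragraphs are invented; the call downloads the jonas/sdg_classifier_osdg model on first use, and running it outside a Streamlit session may only trigger cache warnings from @st.cache):

    import udfPreprocess.sdg as sdg

    paragraphs = [
        "Expand access to affordable and clean energy in rural areas.",
        "Improve maternal health services and reduce child mortality.",
    ]

    # df3 keeps the SDG label and text columns for paragraphs whose score
    # cleared the [sdg] THRESHOLD; x counts how often each SDG label survived.
    df3, x = sdg.sdg_classification(paragraphs)
    print(df3)
    print(x)
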
udfPreprocess/search.py
ADDED
@@ -0,0 +1,145 @@
+import glob, os, sys; sys.path.append('../udfPreprocess')
+
+#import helper
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+#import needed libraries
+import seaborn as sns
+from pandas import DataFrame
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+# from keybert import KeyBERT
+from transformers import pipeline
+import matplotlib.pyplot as plt
+import numpy as np
+import streamlit as st
+import pandas as pd
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import docx
+from docx.shared import Inches
+from docx.shared import Pt
+from docx.enum.style import WD_STYLE_TYPE
+import logging
+logger = logging.getLogger(__name__)
+import tempfile
+import sqlite3
+import configparser
+
+### These are lexcial search related functions/methods#####
+
+def bm25_tokenizer(text):
+    tokenized_doc = []
+    for token in text.lower().split():
+        token = token.strip(string.punctuation)
+
+        if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+            tokenized_doc.append(token)
+    return tokenized_doc
+
+def bm25TokenizeDoc(paraList):
+    tokenized_corpus = []
+    ##########Commenting this for now########### will incorporate paragrpah splitting later.
+    # for passage in tqdm(paraList):
+    #     if len(passage.split()) > 256:
+    #         # st.write("Splitting")
+    #         temp = " ".join(passage.split()[:256])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #         temp = " ".join(passage.split()[256:])
+    #         tokenized_corpus.append(bm25_tokenizer(temp))
+    #     else:
+    #         tokenized_corpus.append(bm25_tokenizer(passage))
+    ######################################################################################33333
+    for passage in tqdm(paraList):
+        tokenized_corpus.append(bm25_tokenizer(passage))
+
+    return tokenized_corpus
+
+def lexical_search(keyword, document_bm25):
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    top_k = int(config.get('lexical_search','TOP_K'))
+    bm25_scores = document_bm25.get_scores(bm25_tokenizer(keyword))
+    top_n = np.argpartition(bm25_scores, -top_k)[-top_k:]
+    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+    return bm25_hits
+
+@st.cache(allow_output_mutation=True)
+def load_sentenceTransformer(name):
+    return SentenceTransformer(name)
+
+
+def semantic_search(keywordlist, paraList):
+
+    ##### Sematic Search #####
+    #query = "Does document contain {} issues ?".format(keyword)
+    config = configparser.ConfigParser()
+    config.read_file(open('udfPreprocess/paramconfig.cfg'))
+    model_name = config.get('semantic_search','MODEL_NAME')
+
+    bi_encoder = load_sentenceTransformer(model_name)
+    bi_encoder.max_seq_length = int(config.get('semantic_search','MAX_SEQ_LENGTH'))     #Truncate long passages to 256 tokens
+    top_k = int(config.get('semantic_search','TOP_K'))
+    document_embeddings = bi_encoder.encode(paraList, convert_to_tensor=True, show_progress_bar=False)
+    question_embedding = bi_encoder.encode(keywordlist, convert_to_tensor=True)
+
+    hits = util.semantic_search(question_embedding, document_embeddings, top_k=top_k)
+
+    return hits
+
+def show_results(keywordList):
+    document = docx.Document()
+    # document.add_heading('Document name:{}'.format(file_name), 2)
+    section = document.sections[0]
+
+    # Calling the footer
+    footer = section.footer
+
+    # Calling the paragraph already present in
+    # the footer section
+    footer_para = footer.paragraphs[0]
+
+    font_styles = document.styles
+    font_charstyle = font_styles.add_style('CommentsStyle', WD_STYLE_TYPE.CHARACTER)
+    font_object = font_charstyle.font
+    font_object.size = Pt(7)
+    # Adding the centered zoned footer
+    footer_para.add_run('''\tPowered by GIZ Data and the Sustainable Development Solution Network hosted at Hugging-Face spaces: https://huggingface.co/spaces/ppsingh/streamlit_dev''', style='CommentsStyle')
+    document.add_heading('Your Seacrhed for {}'.format(keywordList), level=1)
+    for keyword in keywordList:
+
+        st.write("Results for Query: {}".format(keyword))
+        para = document.add_paragraph().add_run("Results for Query: {}".format(keyword))
+        para.font.size = Pt(12)
+        bm25_hits, hits = search(keyword)
+
+        st.markdown("""
+                    We will provide with 2 kind of results. The 'lexical search' and the semantic search.
+                    """)
+        # In the semantic search part we provide two kind of results one with only Retriever (Bi-Encoder) and other the ReRanker (Cross Encoder)
+        st.markdown("Top few lexical search (BM25) hits")
+        document.add_paragraph("Top few lexical search (BM25) hits")
+
+        for hit in bm25_hits[0:5]:
+            if hit['score'] > 0.00:
+                st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+                document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+
+
+
+        # st.table(bm25_hits[0:3])
+
+        st.markdown("\n-------------------------\n")
+        st.markdown("Top few Bi-Encoder Retrieval hits")
+        document.add_paragraph("\n-------------------------\n")
+        document.add_paragraph("Top few Bi-Encoder Retrieval hits")
+
+        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+        for hit in hits[0:5]:
+            # if hit['score'] > 0.45:
+            st.write("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
+            document.add_paragraph("\t Score: {:.3f}: \t{}".format(hit['score'], paraList[hit['corpus_id']].replace("\n", " ")))
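
A sketch of how the lexical and semantic helpers might be wired together outside the app (the sample paragraphs are invented; show_results() additionally expects a search() helper and a paraList in scope that are not defined in this file, so it is left out here). Note that lexical_search() takes the top TOP_K scores with np.argpartition, so the corpus needs at least TOP_K (here 10) paragraphs:

    from rank_bm25 import BM25Okapi
    import udfPreprocess.search as search

    paraList = [
        "Reduce greenhouse gas emissions through a carbon tax.",
        "Expand renewable energy capacity, especially solar and wind.",
        "Strengthen primary health care in rural districts.",
    ] * 4   # repeated so the list is longer than TOP_K from paramconfig.cfg

    # Lexical search: build the BM25 index once from the tokenized paragraphs.
    tokenized_corpus = search.bm25TokenizeDoc(paraList)
    document_bm25 = BM25Okapi(tokenized_corpus)
    bm25_hits = search.lexical_search("renewable energy", document_bm25)

    # Semantic search: bi-encoder named by MODEL_NAME in paramconfig.cfg.
    hits = search.semantic_search(["renewable energy"], paraList)

    for hit in bm25_hits[:3]:
        print(round(hit['score'], 3), paraList[hit['corpus_id']])
    for hit in hits[0][:3]:
        print(round(hit['score'], 3), paraList[hit['corpus_id']])
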
udfPreprocess/uploadAndExample.py
ADDED
@@ -0,0 +1,48 @@
+import streamlit as st
+import tempfile
+import udfPreprocess.docPreprocessing as pre
+import udfPreprocess.cleaning as clean
+
+def add_upload(choice):
+
+
+    if choice == 'Upload Document':
+        uploaded_file = st.sidebar.file_uploader('Upload the File', type=['pdf', 'docx', 'txt'])
+        if uploaded_file is not None:
+            with tempfile.NamedTemporaryFile(mode="wb") as temp:
+                bytes_data = uploaded_file.getvalue()
+                temp.write(bytes_data)
+                st.session_state['filename'] = uploaded_file.name
+                # st.write("Uploaded Filename: ", uploaded_file.name)
+                file_name = uploaded_file.name
+                file_path = temp.name
+                docs = pre.load_document(file_path, file_name)
+                haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+                st.session_state['docs'] = docs
+                st.session_state['paraList'] = paraList
+
+
+    else:
+        # listing the options
+        option = st.sidebar.selectbox('Select the example document',
+                                      ('South Africa:Low Emission strategy',
+                                       'Ethiopia: 10 Year Development Plan'))
+        if option is 'South Africa:Low Emission strategy':
+            file_name = file_path = 'sample/South Africa_s Low Emission Development Strategy.txt'
+            st.session_state['filename'] = file_name
+            # st.write("Selected document:", file_name.split('/')[1])
+            # with open('sample/South Africa_s Low Emission Development Strategy.txt') as dfile:
+            # file = open('sample/South Africa_s Low Emission Development Strategy.txt', 'wb')
+        else:
+            # with open('sample/Ethiopia_s_2021_10 Year Development Plan.txt') as dfile:
+            file_name = file_path = 'sample/Ethiopia_s_2021_10 Year Development Plan.txt'
+            st.session_state['filename'] = file_name
+            # st.write("Selected document:", file_name.split('/')[1])
+
+        if option is not None:
+            docs = pre.load_document(file_path,file_name)
+            haystackDoc, dataframeDoc, textData, paraList = clean.preprocessing(docs)
+            st.session_state['docs'] = docs
+            st.session_state['paraList'] = paraList
+
+
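
A sketch of how add_upload() might be driven from a hypothetical app.py entry point (the radio label and the second option string are assumptions; only the 'Upload Document' value is checked for literally in the function above):

    # hypothetical app.py for the Streamlit space
    import streamlit as st
    import udfPreprocess.uploadAndExample as upload

    choice = st.sidebar.radio("Document source",
                              ("Upload Document", "Try example documents"))

    # Fills st.session_state['docs'] and st.session_state['paraList']
    # for the downstream SDG classification and search steps.
    upload.add_upload(choice)

    if 'paraList' in st.session_state:
        st.write("{} paragraphs ready".format(len(st.session_state['paraList'])))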