prashant committed

Commit f59362a
1 Parent(s): c1078c4

hashing fix
Files changed:
- appStore/sdg_analysis.py  +3 -3
- utils/keyword_extraction.py  +1 -0
- utils/sdg_classifier.py  +37 -5
appStore/sdg_analysis.py
CHANGED
@@ -91,7 +91,8 @@ def app():
     if 'filepath' in st.session_state:
         file_name = st.session_state['filename']
         file_path = st.session_state['filepath']
-        classifier = load_sdgClassifier(
+        classifier = load_sdgClassifier(classifier_name=model_name)
+        st.session_state['sdg_classifier'] = classifier
         allDocuments = runSDGPreprocessingPipeline(fileName= file_name,
                                     filePath= file_path, split_by= split_by,
                                     split_length= split_length,
@@ -107,8 +108,7 @@ def app():
     with st.spinner("Running SDG Classification{}".format(warning_msg)):
 
         df, x = sdg_classification(haystackdoc=allDocuments['documents'],
-                                   threshold= threshold
-                                   classifiermodel= classifier)
+                                   threshold= threshold)
         df = df.drop(['Relevancy'], axis = 1)
         sdg_labels = x.SDG.unique()[::-1]
         textrankkeywordlist = []
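The fix in this file is to stop passing the classifier object into the cached sdg_classification call: the model is loaded once through the cached loader, stored in st.session_state, and the classification step now receives only hashable arguments (the documents and a float threshold). A minimal sketch of the same pattern, with hypothetical load_model/classify names and a generic Transformers pipeline standing in for the app's SDG classifier:

import streamlit as st
from transformers import pipeline

@st.cache(allow_output_mutation=True)
def load_model(model_name: str):
    # cached on the model name string, so the heavyweight model is built only once
    return pipeline("text-classification", model=model_name)

@st.cache(allow_output_mutation=True)
def classify(paragraphs: tuple, threshold: float):
    # only hashable arguments (tuple of strings, float) reach st.cache;
    # the model itself is pulled from session_state, which avoids the hashing error
    model = st.session_state['classifier']
    preds = model(list(paragraphs))
    return [p for p in preds if p['score'] >= threshold]

# hypothetical model name, not the one this app loads from its config
if 'classifier' not in st.session_state:
    st.session_state['classifier'] = load_model("distilbert-base-uncased-finetuned-sst-2-english")

results = classify(("Access to clean water remains limited.",), threshold=0.5)
st.write(results)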
utils/keyword_extraction.py
CHANGED
@@ -107,6 +107,7 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
     keywords = [keyword for keyword in results]
     return keywords
 
+@st.cache(allow_output_mutation=True)
 def textrank(textdata:Text, ratio:float = 0.1, words = 0):
     """
     wrappper function to perform textrank, uses either ratio or wordcount to
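The only change here is caching the textrank wrapper. Since it is a pure function of its text input, st.cache can memoize it on the raw string, so Streamlit reruns on the same document skip the recomputation. A small sketch of that idea; the extractor below (summa) is an assumption, since the diff does not show the body of textrank:

import streamlit as st

@st.cache(allow_output_mutation=True)
def textrank_keywords(textdata: str, ratio: float = 0.1, words: int = 0) -> list:
    # summa is a stand-in assumption for whatever textrank implementation the module uses;
    # either a ratio of the text or an absolute word count selects how much to keep
    from summa import keywords
    if words:
        return keywords.keywords(textdata, words=words, split=True)
    return keywords.keywords(textdata, ratio=ratio, split=True)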
utils/sdg_classifier.py
CHANGED
@@ -7,14 +7,35 @@ import logging
 import pandas as pd
 from pandas import DataFrame, Series
 from utils.checkconfig import getconfig
+from utils.streamlitcheck import check_streamlit
 from utils.preprocessing import processingpipeline
 try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
 
+## Labels dictionary ###
+_lab_dict = {0: 'no_cat',
+             1:'SDG 1 - No poverty',
+             2:'SDG 2 - Zero hunger',
+             3:'SDG 3 - Good health and well-being',
+             4:'SDG 4 - Quality education',
+             5:'SDG 5 - Gender equality',
+             6:'SDG 6 - Clean water and sanitation',
+             7:'SDG 7 - Affordable and clean energy',
+             8:'SDG 8 - Decent work and economic growth',
+             9:'SDG 9 - Industry, Innovation and Infrastructure',
+             10:'SDG 10 - Reduced inequality',
+             11:'SDG 11 - Sustainable cities and communities',
+             12:'SDG 12 - Responsible consumption and production',
+             13:'SDG 13 - Climate action',
+             14:'SDG 14 - Life below water',
+             15:'SDG 15 - Life on land',
+             16:'SDG 16 - Peace, justice and strong institutions',
+             17:'SDG 17 - Partnership for the goals',}
+
 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier(configFile = None, docClassifierModel = None):
+def load_sdgClassifier(configFile = None, classifier_name = None):
     """
     loads the document classifier using haystack, where the name/path of model
     in HF-hub as string is used to fetch the model object.Either configfile or
@@ -31,17 +52,17 @@ def load_sdgClassifier(configFile = None, docClassifierModel = None):
 
     Return: document classifier model
     """
-    if not
+    if not classifier_name:
         if not configFile:
             logging.warning("Pass either model name or config file")
             return
         else:
             config = getconfig(configFile)
-
+            classifier_name = config.get('sdg','MODEL')
 
     logging.info("Loading classifier")
     doc_classifier = TransformersDocumentClassifier(
-                        model_name_or_path=
+                        model_name_or_path=classifier_name,
                         task="text-classification")
 
     return doc_classifier
@@ -49,7 +70,7 @@ def load_sdgClassifier(configFile = None, docClassifierModel = None):
 
 @st.cache(allow_output_mutation=True)
 def sdg_classification(haystackdoc:List[Document],
-                       threshold:float, classifiermodel)->Tuple[DataFrame,Series]:
+                       threshold:float, classifiermodel= None)->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -60,6 +81,10 @@ def sdg_classification(haystackdoc:List[Document],
     haystackdoc: List of haystack Documents. The output of Preprocessing Pipeline
     contains the list of paragraphs in different format,here the list of
     Haystack Documents is used.
+    threshold: threshold value for the model to keep the results from classifier
+    classifiermodel: you can pass the classifier model directly, however in case of
+    streamlit avoid it.
+
 
     Returns
     ----------
@@ -69,6 +94,13 @@ def sdg_classification(haystackdoc:List[Document],
 
     """
     logging.info("Working on SDG Classification")
+    if not classifiermodel:
+        if check_streamlit:
+            classifiermodel = st.session_state['sdg_classifier']
+        else:
+            logging.warning("No streamlit envinornment found, Pass the classifier")
+            return
+
     results = classifiermodel.predict(haystackdoc)
 
 
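The larger change in this file is the fallback order inside sdg_classification: use the classifiermodel argument if one is passed, otherwise pull the classifier from st.session_state when running under Streamlit, otherwise warn and return. A sketch of that resolution order with a hypothetical in_streamlit() helper (the real utils.streamlitcheck.check_streamlit is not shown in this diff, so the helper below is an assumption):

import logging
from typing import List, Optional

try:
    import streamlit as st
except ImportError:
    st = None

def in_streamlit() -> bool:
    # hypothetical stand-in for utils.streamlitcheck.check_streamlit:
    # Streamlit must be importable and a script run context must be active
    if st is None:
        return False
    try:
        from streamlit.runtime.scriptrunner import get_script_run_ctx
    except ImportError:
        return False
    return get_script_run_ctx() is not None

def classify(paragraphs: List[str], threshold: float, model=None) -> Optional[list]:
    # resolution order mirrored from the diff: explicit argument, then session_state, else bail out
    if model is None:
        if in_streamlit():
            model = st.session_state['sdg_classifier']
        else:
            logging.warning("No Streamlit environment found, pass the classifier explicitly")
            return None
    # model is any callable mapping texts to [{'label': ..., 'score': ...}]
    preds = model(paragraphs)
    return [p for p in preds if p['score'] >= threshold]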