prashant
committed on
Commit
•
40debb1
1
Parent(s):
a3c251d
trying streamlit-aggrid
Browse files
- appStore/sdg_analysis.py +31 -16
- paramconfig.cfg +1 -0
- requirements.txt +1 -0
- utils/keyword_extraction.py +6 -1
- utils/sdg_classifier.py +24 -0
appStore/sdg_analysis.py
CHANGED
@@ -12,6 +12,7 @@ import docx
|
|
12 |
from docx.shared import Inches
|
13 |
from docx.shared import Pt
|
14 |
from docx.enum.style import WD_STYLE_TYPE
|
|
|
15 |
from utils.sdg_classifier import sdg_classification
|
16 |
from utils.sdg_classifier import runSDGPreprocessingPipeline
|
17 |
from utils.keyword_extraction import keywordExtraction, textrank
|
@@ -22,6 +23,7 @@ logger = logging.getLogger(__name__)
|
|
22 |
|
23 |
def app():
|
24 |
|
|
|
25 |
with st.container():
|
26 |
st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
|
27 |
st.write(' ')
|
@@ -72,7 +74,25 @@ def app():
|
|
72 |
""")
|
73 |
st.markdown("")
|
74 |
|
75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
with st.container():
|
77 |
if st.button("RUN SDG Analysis"):
|
78 |
|
@@ -90,15 +110,15 @@ def app():
|
|
90 |
|
91 |
df, x = sdg_classification(allDocuments['documents'])
|
92 |
sdg_labels = df.SDG.unique()
|
93 |
-
# tfidfkeywordList = []
|
94 |
textrankkeywordlist = []
|
95 |
for label in sdg_labels:
|
96 |
sdgdata = " ".join(df[df.SDG == label].text.to_list())
|
97 |
# tfidflist_ = keywordExtraction(label,[sdgdata])
|
98 |
-
textranklist_ = textrank(sdgdata
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
102 |
tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
|
103 |
|
104 |
|
@@ -106,9 +126,9 @@ def app():
|
|
106 |
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
|
107 |
# plot
|
108 |
fig, ax = plt.subplots()
|
109 |
-
ax.pie(x, colors=colors, radius=
|
110 |
wedgeprops={"linewidth": 1, "edgecolor": "white"},
|
111 |
-
frame=False,labels =list(x.
|
112 |
# fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
|
113 |
|
114 |
|
@@ -120,20 +140,15 @@ def app():
|
|
120 |
st.pyplot(fig)
|
121 |
|
122 |
st.markdown("##### What keywords are present under SDG classified text? #####")
|
123 |
-
st.write("TFIDF BASED")
|
124 |
|
125 |
c1, c2, c3 = st.columns([1, 10, 1])
|
126 |
with c2:
|
127 |
-
st.table(
|
128 |
-
|
129 |
-
st.write("TextRank BASED")
|
130 |
|
131 |
-
|
132 |
-
with c12:
|
133 |
-
st.table(tRkeywordsDf)
|
134 |
c7, c8, c9 = st.columns([1, 10, 1])
|
135 |
with c8:
|
136 |
-
|
137 |
else:
|
138 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
139 |
logging.warning("Terminated as no document provided")
|
|
|
12 |
from docx.shared import Inches
|
13 |
from docx.shared import Pt
|
14 |
from docx.enum.style import WD_STYLE_TYPE
|
15 |
+
from st_aggrid import AgGrid
|
16 |
from utils.sdg_classifier import sdg_classification
|
17 |
from utils.sdg_classifier import runSDGPreprocessingPipeline
|
18 |
from utils.keyword_extraction import keywordExtraction, textrank
|
|
|
23 |
|
24 |
def app():
|
25 |
|
26 |
+
#### APP INFO #####
|
27 |
with st.container():
|
28 |
st.markdown("<h2 style='text-align: center; color: black;'> SDG Classification and Keyphrase Extraction </h2>", unsafe_allow_html=True)
|
29 |
st.write(' ')
|
|
|
74 |
""")
|
75 |
st.markdown("")
|
76 |
|
77 |
+
_lab_dict = {0: 'no_cat',
|
78 |
+
1:'SDG 1 - No poverty',
|
79 |
+
2:'SDG 2 - Zero hunger',
|
80 |
+
3:'SDG 3 - Good health and well-being',
|
81 |
+
4:'SDG 4 - Quality education',
|
82 |
+
5:'SDG 5 - Gender equality',
|
83 |
+
6:'SDG 6 - Clean water and sanitation',
|
84 |
+
7:'SDG 7 - Affordable and clean energy',
|
85 |
+
8:'SDG 8 - Decent work and economic growth',
|
86 |
+
9:'SDG 9 - Industry, Innovation and Infrastructure',
|
87 |
+
10:'SDG 10 - Reduced inequality',
|
88 |
+
11:'SDG 11 - Sustainable cities and communities',
|
89 |
+
12:'SDG 12 - Responsible consumption and production',
|
90 |
+
13:'SDG 13 - Climate action',
|
91 |
+
14:'SDG 14 - Life below water',
|
92 |
+
15:'SDG 15 - Life on land',
|
93 |
+
16:'SDG 16 - Peace, justice and strong institutions',
|
94 |
+
17:'SDG 17 - Partnership for the goals',}
|
95 |
+
|
96 |
with st.container():
|
97 |
if st.button("RUN SDG Analysis"):
|
98 |
|
|
|
110 |
|
111 |
df, x = sdg_classification(allDocuments['documents'])
|
112 |
sdg_labels = df.SDG.unique()
|
|
|
113 |
textrankkeywordlist = []
|
114 |
for label in sdg_labels:
|
115 |
sdgdata = " ".join(df[df.SDG == label].text.to_list())
|
116 |
# tfidflist_ = keywordExtraction(label,[sdgdata])
|
117 |
+
textranklist_ = textrank(sdgdata)
|
118 |
+
if len(textranklist_) > 0:
|
119 |
+
# tfidfkeywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_})
|
120 |
+
textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':textranklist_})
|
121 |
+
# tfidfkeywordsDf = pd.DataFrame(tfidfkeywordList)
|
122 |
tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
|
123 |
|
124 |
|
|
|
126 |
colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
|
127 |
# plot
|
128 |
fig, ax = plt.subplots()
|
129 |
+
ax.pie(x.count, colors=colors, radius=3, center=(4, 4),
|
130 |
wedgeprops={"linewidth": 1, "edgecolor": "white"},
|
131 |
+
frame=False,labels =list(x.SDG_name))
|
132 |
# fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
|
133 |
|
134 |
|
|
|
140 |
st.pyplot(fig)
|
141 |
|
142 |
st.markdown("##### What keywords are present under SDG classified text? #####")
|
|
|
143 |
|
144 |
c1, c2, c3 = st.columns([1, 10, 1])
|
145 |
with c2:
|
146 |
+
st.table(tRkeywordsDf)
|
|
|
|
|
147 |
|
148 |
+
st.markdown("##### Top few SDG Classified paragraph/text results #####")
|
|
|
|
|
149 |
c7, c8, c9 = st.columns([1, 10, 1])
|
150 |
with c8:
|
151 |
+
AgGrid(df)
|
152 |
else:
|
153 |
st.info("🤔 No document found, please try to upload it at the sidebar!")
|
154 |
logging.warning("Terminated as no document provided")
|
paramconfig.cfg
CHANGED
@@ -25,6 +25,7 @@ REMOVE_PUNC = 0
|
|
25 |
SPLIT_LENGTH = 120
|
26 |
SPLIT_OVERLAP = 10
|
27 |
RESPECT_SENTENCE_BOUNDARY = 1
|
|
|
28 |
|
29 |
[preprocessor]
|
30 |
SPLIT_OVERLAP_WORD = 10
|
|
|
25 |
SPLIT_LENGTH = 120
|
26 |
SPLIT_OVERLAP = 10
|
27 |
RESPECT_SENTENCE_BOUNDARY = 1
|
28 |
+
TOP_KEY = 15
|
29 |
|
30 |
[preprocessor]
|
31 |
SPLIT_OVERLAP_WORD = 10
|
requirements.txt
CHANGED
@@ -14,5 +14,6 @@ transformers==4.21.2
|
|
14 |
st-annotated-text==3.0.0
|
15 |
markdown==3.4.1
|
16 |
summa==1.2.0
|
|
|
17 |
python-docx
|
18 |
streamlit_option_menu
|
|
|
14 |
st-annotated-text==3.0.0
|
15 |
markdown==3.4.1
|
16 |
summa==1.2.0
|
17 |
+
streamlit-aggrid
|
18 |
python-docx
|
19 |
streamlit_option_menu
|
utils/keyword_extraction.py
CHANGED
@@ -66,7 +66,12 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
|
|
66 |
|
67 |
def textrank(textdata, ratio = 0.1, words = 0):
|
68 |
if words == 0:
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
70 |
else:
|
71 |
results = keywords.keywords(textdata, words= words).split("\n")
|
72 |
|
|
|
66 |
|
67 |
def textrank(textdata, ratio = 0.1, words = 0):
|
68 |
if words == 0:
|
69 |
+
try:
|
70 |
+
words = config.get('sdg','TOP_KEY')
|
71 |
+
results = keywords.keywords(textdata, words = ratio).split("\n")
|
72 |
+
except:
|
73 |
+
logging.warning("paramconfig not found, running textrank with ratio")
|
74 |
+
results = keywords.keywords(textdata, ratio= ratio).split("\n")
|
75 |
else:
|
76 |
results = keywords.keywords(textdata, words= words).split("\n")
|
77 |
|
utils/sdg_classifier.py
CHANGED
@@ -3,6 +3,7 @@ from haystack.schema import Document
|
|
3 |
from typing import List, Tuple
|
4 |
import configparser
|
5 |
import logging
|
|
|
6 |
from pandas import DataFrame, Series
|
7 |
from utils.preprocessing import processingpipeline
|
8 |
try:
|
@@ -17,6 +18,25 @@ except Exception:
|
|
17 |
st.info("Please place the paramconfig file in the same directory as app.py")
|
18 |
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
@st.cache(allow_output_mutation=True)
|
21 |
def load_sdgClassifier():
|
22 |
"""
|
@@ -73,6 +93,10 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
|
|
73 |
df.index += 1
|
74 |
df =df[df['Relevancy']>threshold]
|
75 |
x = df['SDG'].value_counts()
|
|
|
|
|
|
|
|
|
76 |
df= df.drop(['Relevancy'], axis = 1)
|
77 |
|
78 |
|
|
|
3 |
from typing import List, Tuple
|
4 |
import configparser
|
5 |
import logging
|
6 |
+
import pandas as pd
|
7 |
from pandas import DataFrame, Series
|
8 |
from utils.preprocessing import processingpipeline
|
9 |
try:
|
|
|
18 |
st.info("Please place the paramconfig file in the same directory as app.py")
|
19 |
|
20 |
|
21 |
+
_lab_dict = {0: 'no_cat',
|
22 |
+
1:'SDG 1 - No poverty',
|
23 |
+
2:'SDG 2 - Zero hunger',
|
24 |
+
3:'SDG 3 - Good health and well-being',
|
25 |
+
4:'SDG 4 - Quality education',
|
26 |
+
5:'SDG 5 - Gender equality',
|
27 |
+
6:'SDG 6 - Clean water and sanitation',
|
28 |
+
7:'SDG 7 - Affordable and clean energy',
|
29 |
+
8:'SDG 8 - Decent work and economic growth',
|
30 |
+
9:'SDG 9 - Industry, Innovation and Infrastructure',
|
31 |
+
10:'SDG 10 - Reduced inequality',
|
32 |
+
11:'SDG 11 - Sustainable cities and communities',
|
33 |
+
12:'SDG 12 - Responsible consumption and production',
|
34 |
+
13:'SDG 13 - Climate action',
|
35 |
+
14:'SDG 14 - Life below water',
|
36 |
+
15:'SDG 15 - Life on land',
|
37 |
+
16:'SDG 16 - Peace, justice and strong institutions',
|
38 |
+
17:'SDG 17 - Partnership for the goals',}
|
39 |
+
|
40 |
@st.cache(allow_output_mutation=True)
|
41 |
def load_sdgClassifier():
|
42 |
"""
|
|
|
93 |
df.index += 1
|
94 |
df =df[df['Relevancy']>threshold]
|
95 |
x = df['SDG'].value_counts()
|
96 |
+
x = x.rename('count')
|
97 |
+
x = x.rename_axis('SDG').reset_index()
|
98 |
+
x["SDG"] = pd.to_numeric(x["SDG"])
|
99 |
+
x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
|
100 |
df= df.drop(['Relevancy'], axis = 1)
|
101 |
|
102 |
|