prashant
committed on
Commit
•
5bc4948
1
Parent(s):
0e0caa9
adding textrank
Browse files
- appStore/sdg_analysis.py +11 -5
- paramconfig.cfg +2 -1
- requirements.txt +1 -0
- utils/keyword_extraction.py +8 -2
appStore/sdg_analysis.py
CHANGED
@@ -14,7 +14,7 @@ from docx.shared import Pt
|
|
14 |
from docx.enum.style import WD_STYLE_TYPE
|
15 |
from utils.sdg_classifier import sdg_classification
|
16 |
from utils.sdg_classifier import runSDGPreprocessingPipeline
|
17 |
-
from utils.keyword_extraction import keywordExtraction
|
18 |
import logging
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
@@ -59,9 +59,13 @@ def app():
|
|
59 |
keywordList = []
|
60 |
for label in sdg_labels:
|
61 |
sdgdata = " ".join(df[df.SDG == label].text.to_list())
|
62 |
-
|
63 |
-
|
|
|
64 |
keywordsDf = pd.DataFrame(keywordList)
|
|
|
|
|
|
|
65 |
|
66 |
|
67 |
|
@@ -83,11 +87,13 @@ def app():
|
|
83 |
with c5:
|
84 |
st.pyplot(fig)
|
85 |
|
86 |
-
st.markdown("##### What keywords are present under SDG
|
|
|
87 |
|
88 |
-
c1, c2, c3 = st.columns([1,
|
89 |
with c2:
|
90 |
st.table(keywordsDf)
|
|
|
91 |
|
92 |
c7, c8, c9 = st.columns([1, 10, 1])
|
93 |
with c8:
|
|
|
14 |
from docx.enum.style import WD_STYLE_TYPE
|
15 |
from utils.sdg_classifier import sdg_classification
|
16 |
from utils.sdg_classifier import runSDGPreprocessingPipeline
|
17 |
+
from utils.keyword_extraction import keywordExtraction, textrank
|
18 |
import logging
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
|
|
59 |
keywordList = []
|
60 |
for label in sdg_labels:
|
61 |
sdgdata = " ".join(df[df.SDG == label].text.to_list())
|
62 |
+
tfidflist_ = keywordExtraction(label,[sdgdata])
|
63 |
+
textranklist_ = textrank(sdgdata, words = 20)
|
64 |
+
keywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_, 'TEXT RANK':textranklist_})
|
65 |
keywordsDf = pd.DataFrame(keywordList)
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
|
70 |
|
71 |
|
|
|
87 |
with c5:
|
88 |
st.pyplot(fig)
|
89 |
|
90 |
+
st.markdown("##### What keywords are present under SDG classified text? #####")
|
91 |
+
st.write("TFIDF BASED")
|
92 |
|
93 |
+
c1, c2, c3 = st.columns([1, 10, 1])
|
94 |
with c2:
|
95 |
st.table(keywordsDf)
|
96 |
+
|
97 |
|
98 |
c7, c8, c9 = st.columns([1, 10, 1])
|
99 |
with c8:
|
paramconfig.cfg
CHANGED
@@ -30,4 +30,5 @@ SPLIT_OVERLAP_WORD = 10
|
|
30 |
SPLIT_OVERLAP_SENTENCE = 1
|
31 |
|
32 |
[tfidf]
|
33 |
-
TOP_N =
|
|
|
|
30 |
SPLIT_OVERLAP_SENTENCE = 1
|
31 |
|
32 |
[tfidf]
|
33 |
+
TOP_N = 20
|
34 |
+
TEXTRANK_WORDS = 20
|
requirements.txt
CHANGED
@@ -13,5 +13,6 @@ seaborn==0.11.2
|
|
13 |
transformers==4.21.2
|
14 |
st-annotated-text==3.0.0
|
15 |
markdown==3.4.1
|
|
|
16 |
python-docx
|
17 |
streamlit_option_menu
|
|
|
13 |
transformers==4.21.2
|
14 |
st-annotated-text==3.0.0
|
15 |
markdown==3.4.1
|
16 |
+
summa==1.2.0
|
17 |
python-docx
|
18 |
streamlit_option_menu
|
utils/keyword_extraction.py
CHANGED
@@ -7,6 +7,8 @@ import pickle
|
|
7 |
from typing import List, Text
|
8 |
import configparser
|
9 |
import logging
|
|
|
|
|
10 |
try:
|
11 |
from termcolor import colored
|
12 |
except:
|
@@ -62,8 +64,12 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
|
|
62 |
keywords = [keyword for keyword in results]
|
63 |
return keywords
|
64 |
|
|
|
|
|
|
|
|
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
|
69 |
|
|
|
7 |
from typing import List, Text
|
8 |
import configparser
|
9 |
import logging
|
10 |
+
from summa import keywords
|
11 |
+
|
12 |
try:
|
13 |
from termcolor import colored
|
14 |
except:
|
|
|
64 |
keywords = [keyword for keyword in results]
|
65 |
return keywords
|
66 |
|
67 |
+
def textrank(textdata, ratio = 0.1, words = 0):
    """
    Extract keywords from a text using the TextRank implementation in `summa`.

    Params
    --------
    textdata: the text to extract keywords from, as one string.
    ratio: share of the candidate keywords to return; only used when
        `words` is 0.
    words: number of keywords to return. When this is non-zero it is passed
        to summa instead of `ratio`.

    Return
    --------
    results: list of keyword strings, one entry per line of summa's output.
        Empty list when the text is empty or no keyword was found.
    """
    # Nothing to rank: skip the library call and avoid returning [''].
    if not textdata or not textdata.strip():
        return []

    if words == 0:
        raw = keywords.keywords(textdata, ratio=ratio)
    else:
        try:
            raw = keywords.keywords(textdata, words=words)
        except IndexError:
            # NOTE(review): summa appears to index past its candidate list
            # when `words` is larger than the number of candidates found in
            # a short text -- confirm against the pinned summa version.
            # Fall back to returning every keyword that is available.
            raw = keywords.keywords(textdata, ratio=1.0)

    # "".split("\n") gives [''], so drop empty entries.
    return [keyword for keyword in raw.split("\n") if keyword]
|
|
|
74 |
|
75 |
|