Spaces:
GIZ
/
Running on CPU Upgrade

prashant committed on
Commit
5bc4948
1 Parent(s): 0e0caa9

adding textrank

Browse files
appStore/sdg_analysis.py CHANGED
@@ -14,7 +14,7 @@ from docx.shared import Pt
14
  from docx.enum.style import WD_STYLE_TYPE
15
  from utils.sdg_classifier import sdg_classification
16
  from utils.sdg_classifier import runSDGPreprocessingPipeline
17
- from utils.keyword_extraction import keywordExtraction
18
  import logging
19
  logger = logging.getLogger(__name__)
20
 
@@ -59,9 +59,13 @@ def app():
59
  keywordList = []
60
  for label in sdg_labels:
61
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
62
- list_ = keywordExtraction(label,[sdgdata])
63
- keywordList.append({'SDG':label, 'Keywords':list_})
 
64
  keywordsDf = pd.DataFrame(keywordList)
 
 
 
65
 
66
 
67
 
@@ -83,11 +87,13 @@ def app():
83
  with c5:
84
  st.pyplot(fig)
85
 
86
- st.markdown("##### What keywords are present under SDG labels? #####")
 
87
 
88
- c1, c2, c3 = st.columns([1, 3, 1])
89
  with c2:
90
  st.table(keywordsDf)
 
91
 
92
  c7, c8, c9 = st.columns([1, 10, 1])
93
  with c8:
 
14
  from docx.enum.style import WD_STYLE_TYPE
15
  from utils.sdg_classifier import sdg_classification
16
  from utils.sdg_classifier import runSDGPreprocessingPipeline
17
+ from utils.keyword_extraction import keywordExtraction, textrank
18
  import logging
19
  logger = logging.getLogger(__name__)
20
 
 
59
  keywordList = []
60
  for label in sdg_labels:
61
  sdgdata = " ".join(df[df.SDG == label].text.to_list())
62
+ tfidflist_ = keywordExtraction(label,[sdgdata])
63
+ textranklist_ = textrank(sdgdata, words = 20)
64
+ keywordList.append({'SDG':label, 'TFIDF Keywords':tfidflist_, 'TEXT RANK':textranklist_})
65
  keywordsDf = pd.DataFrame(keywordList)
66
+
67
+
68
+
69
 
70
 
71
 
 
87
  with c5:
88
  st.pyplot(fig)
89
 
90
+ st.markdown("##### What keywords are present under SDG classified text? #####")
91
+ st.write("TFIDF BASED")
92
 
93
+ c1, c2, c3 = st.columns([1, 10, 1])
94
  with c2:
95
  st.table(keywordsDf)
96
+
97
 
98
  c7, c8, c9 = st.columns([1, 10, 1])
99
  with c8:
paramconfig.cfg CHANGED
@@ -30,4 +30,5 @@ SPLIT_OVERLAP_WORD = 10
30
  SPLIT_OVERLAP_SENTENCE = 1
31
 
32
  [tfidf]
33
- TOP_N = 10
 
 
30
  SPLIT_OVERLAP_SENTENCE = 1
31
 
32
  [tfidf]
33
+ TOP_N = 20
34
+ TEXTRANK_WORDS = 20
requirements.txt CHANGED
@@ -13,5 +13,6 @@ seaborn==0.11.2
13
  transformers==4.21.2
14
  st-annotated-text==3.0.0
15
  markdown==3.4.1
 
16
  python-docx
17
  streamlit_option_menu
 
13
  transformers==4.21.2
14
  st-annotated-text==3.0.0
15
  markdown==3.4.1
16
+ summa==1.2.0
17
  python-docx
18
  streamlit_option_menu
utils/keyword_extraction.py CHANGED
@@ -7,6 +7,8 @@ import pickle
7
  from typing import List, Text
8
  import configparser
9
  import logging
 
 
10
  try:
11
  from termcolor import colored
12
  except:
@@ -62,8 +64,12 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
62
  keywords = [keyword for keyword in results]
63
  return keywords
64
 
 
 
 
 
 
65
 
66
-
67
-
68
 
69
 
 
7
  from typing import List, Text
8
  import configparser
9
  import logging
10
+ from summa import keywords
11
+
12
  try:
13
  from termcolor import colored
14
  except:
 
64
  keywords = [keyword for keyword in results]
65
  return keywords
66
 
67
+ def textrank(textdata, ratio = 0.1, words = 0):
68
+ if words == 0:
69
+ results = keywords.keywords(textdata, ratio= ratio).split("\n")
70
+ else:
71
+ results = keywords.keywords(textdata, words= words).split("\n")
72
 
73
+ return results
 
74
 
75