Spaces:
GIZ
/
Running on CPU Upgrade

prashant committed on
Commit
0e0caa9
1 Parent(s): c73b10a

tfidf implementation

Browse files
appStore/sdg_analysis.py CHANGED
@@ -6,6 +6,7 @@ sys.path.append('../utils')
6
  import seaborn as sns
7
  import matplotlib.pyplot as plt
8
  import numpy as np
 
9
  import streamlit as st
10
  import docx
11
  from docx.shared import Inches
@@ -13,6 +14,7 @@ from docx.shared import Pt
13
  from docx.enum.style import WD_STYLE_TYPE
14
  from utils.sdg_classifier import sdg_classification
15
  from utils.sdg_classifier import runSDGPreprocessingPipeline
 
16
  import logging
17
  logger = logging.getLogger(__name__)
18
 
@@ -53,6 +55,16 @@ def app():
53
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
54
 
55
  df, x = sdg_classification(allDocuments['documents'])
 
 
 
 
 
 
 
 
 
 
56
 
57
  plt.rcParams['font.size'] = 25
58
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
@@ -62,12 +74,20 @@ def app():
62
  wedgeprops={"linewidth": 1, "edgecolor": "white"},
63
  frame=False,labels =list(x.index))
64
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
 
 
65
  st.markdown("#### Anything related to SDGs? ####")
66
 
67
  c4, c5, c6 = st.columns([2, 2, 2])
68
 
69
  with c5:
70
  st.pyplot(fig)
 
 
 
 
 
 
71
 
72
  c7, c8, c9 = st.columns([1, 10, 1])
73
  with c8:
 
6
  import seaborn as sns
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
+ import pandas as pd
10
  import streamlit as st
11
  import docx
12
  from docx.shared import Inches
 
14
  from docx.enum.style import WD_STYLE_TYPE
15
  from utils.sdg_classifier import sdg_classification
16
  from utils.sdg_classifier import runSDGPreprocessingPipeline
17
+ from utils.keyword_extraction import keywordExtraction
18
  import logging
19
  logger = logging.getLogger(__name__)
20
 
 
55
  with st.spinner("Running SDG Classification{}".format(warning_msg)):
56
 
57
  df, x = sdg_classification(allDocuments['documents'])
58
+ sdg_labels = df.SDG.unique()
59
+ keywordList = []
60
+ for label in sdg_labels:
61
+ sdgdata = " ".join(df[df.SDG == label].text.to_list())
62
+ list_ = keywordExtraction(label,[sdgdata])
63
+ keywordList.append({'SDG':label, 'Keywords':list_})
64
+ keywordsDf = pd.DataFrame(keywordList)
65
+
66
+
67
+
68
 
69
  plt.rcParams['font.size'] = 25
70
  colors = plt.get_cmap('Blues')(np.linspace(0.2, 0.7, len(x)))
 
74
  wedgeprops={"linewidth": 1, "edgecolor": "white"},
75
  frame=False,labels =list(x.index))
76
  # fig.savefig('temp.png', bbox_inches='tight',dpi= 100)
77
+
78
+
79
  st.markdown("#### Anything related to SDGs? ####")
80
 
81
  c4, c5, c6 = st.columns([2, 2, 2])
82
 
83
  with c5:
84
  st.pyplot(fig)
85
+
86
+ st.markdown("##### What keywords are present under SDG labels? #####")
87
+
88
+ c1, c2, c3 = st.columns([1, 3, 1])
89
+ with c2:
90
+ st.table(keywordsDf)
91
 
92
  c7, c8, c9 = st.columns([1, 10, 1])
93
  with c8:
docStore/sdg1/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7654f8b50afb0701e99a7906e11a65f9b45939edc9ac8468fac735b161f825db
3
+ size 346162
docStore/sdg1/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddf6f71ce6190227ab07dea06808701b9d364ff254c5745b6edd493fe716cbd1
3
+ size 2919669
docStore/sdg10/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06853c03f3185b9fa884e1ebc7c03311cf2e152cc7bd8fe62121a51851d903c1
3
+ size 141634
docStore/sdg10/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e58bfa7e608c4a4f56e1e34af6abab9c4e5304ad80f125a7d066303f6d97c2b2
3
+ size 1358391
docStore/sdg11/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b490b5d0a6097bf58342e8727acb4971fbdbf9853bdacaade22cc66284aa340
3
+ size 273522
docStore/sdg11/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09a9bba4d948ad1eed1e1366ffaf7904ff2d3a78270f4901b21e99a4420c9989
3
+ size 2741473
docStore/sdg12/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bcc76a3b074bcc6b0b8ccb9c313ce503171da08850a062d37e085a93ade4ef2
3
+ size 61282
docStore/sdg12/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ba7319943e5751b35b7641d842b5457f33d7de9b68d4943250a3e8d6ef98463
3
+ size 636060
docStore/sdg13/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b381b7afe2aac6a1e48bf2fa32b6aac31279d8cd536ce877e21bee386e36b49
3
+ size 286354
docStore/sdg13/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e96785effd072fe33332219f74b67da7cf522e709294e296ed24660809cdd13
3
+ size 2435032
docStore/sdg14/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:555e0c643224dfdc88121e9297b9593f8413fb189db73c98c7aaf93474162fe4
3
+ size 146354
docStore/sdg14/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d3c066d63094799820fd76ff1bedaceb9ec14c898f23ada6a8f82ab7fe3d6e
3
+ size 1386363
docStore/sdg15/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106415f3cd02691b4f384119d97d29334af89c5b905ce288cfd69a2f5ded31af
3
+ size 121186
docStore/sdg15/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d43b2bf946b348a15779b3c34c257ae162db00687d8cdbdf3f083d78caf63f63
3
+ size 1233312
docStore/sdg2/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b9f94015eada5ec8cc288769ff0bb7edce2ae2448f1ec15d292b05ecac35f6
3
+ size 302482
docStore/sdg2/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a026f1442d52657d9e32f31a3b5786aeed5b4c6c10c7882b572d7314d60145e2
3
+ size 2832751
docStore/sdg3/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b893cdbae8015ad6e31b193ec090c1d3e7cdfb0f55050a19f9d57d5c651f948
3
+ size 327186
docStore/sdg3/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d187b4a6928eac29e4b3fb1cdb9461fa6d73af3b884d3a7dd3b1dfe558bd5a8
3
+ size 3022375
docStore/sdg4/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b43b2884b872a533fd1f54c4e979f94abdd62b098aac32e169c8046589ea40e
3
+ size 451330
docStore/sdg4/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:147f45e52c42870809b5b84a8e3a0a49c394106234f63583ba843ff381651441
3
+ size 3901248
docStore/sdg5/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73c258c9befe2c83952fd627ef74a99e377b65445961bc08f7661e72855e108c
3
+ size 534434
docStore/sdg5/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d374aa813abf087c64cd224c4dd3bbbb5fc0ebc6935c08ccca29a78bf816ffc
3
+ size 4695356
docStore/sdg6/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e983449ba8c8845f8ad909a100693e72f48e497e0f4d1c9f0210c1e3786dbbb
3
+ size 329554
docStore/sdg6/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b51d687346dbe352862796d7ebd841d8c4322ebe51f19d8a72505ebfef03960
3
+ size 3183206
docStore/sdg7/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:645ed067af827accf1d150cc1c9a6a471f7a5162c3e778352cc86f8533edb4b1
3
+ size 365442
docStore/sdg7/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a4bff8c2ec1329563cc5b861697d705b2e6314f4838c25c30a4b44ccc715107
3
+ size 3427796
docStore/sdg8/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3d8f49806436a9d74ac1c813e54ea18ceda6251a92e0a10cf582cd16732e9cb
3
+ size 193730
docStore/sdg8/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d655643005c76a10d1cf81fadb239468e3147cc45d6f3d8ae06d61374f1954e3
3
+ size 1876360
docStore/sdg9/tfidfmodel.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10ad6f463d16ebdc5ef1d2da9095634803663a67c780b3ec19e6db61988d744c
3
+ size 141202
docStore/sdg9/vectorizer.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a6c9867a76dec849638864fa1463033b0620f9611dc6cdbac22c1d7a8cbf9b8
3
- size 2520
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3341e4f9b16a810283e2be1ced50f3c1dddde24ace907a9751fe8b7cd89f553
3
+ size 1433825
paramconfig.cfg CHANGED
@@ -28,3 +28,6 @@ SPLIT_OVERLAP = 10
28
  [preprocessor]
29
  SPLIT_OVERLAP_WORD = 10
30
  SPLIT_OVERLAP_SENTENCE = 1
 
 
 
 
28
  [preprocessor]
29
  SPLIT_OVERLAP_WORD = 10
30
  SPLIT_OVERLAP_SENTENCE = 1
31
+
32
+ [tfidf]
33
+ TOP_N = 10
utils/keyword_extraction.py CHANGED
@@ -1,9 +1,27 @@
1
  import pandas as pd
2
- from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
3
- import nltk
4
- nltk.download('stopwords')
5
- from nltk.corpus import stopwords
6
  import pickle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def sort_coo(coo_matrix):
@@ -32,11 +50,20 @@ def extract_topn_from_vector(feature_names, sorted_items, topn=10):
32
 
33
  return results
34
 
35
- def keyword_extraction(sdg:int,sdgdata):
36
  model_path = "docStore/sdg{}/".format(sdg)
37
  vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
38
  tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
39
  features = vectorizer.get_feature_names_out()
 
 
 
 
 
 
 
 
 
40
 
41
 
42
 
 
1
  import pandas as pd
2
+ # from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
3
+ # import nltk
4
+ # nltk.download('stopwords')
5
+ # from nltk.corpus import stopwords
6
  import pickle
7
+ from typing import List, Text
8
+ import configparser
9
+ import logging
10
+ try:
11
+ from termcolor import colored
12
+ except:
13
+ pass
14
+
15
+ try:
16
+ import streamlit as st
17
+ except ImportError:
18
+ logging.info("Streamlit not installed")
19
+ config = configparser.ConfigParser()
20
+ try:
21
+ config.read_file(open('paramconfig.cfg'))
22
+ except Exception:
23
+ logging.warning("paramconfig file not found")
24
+ st.info("Please place the paramconfig file in the same directory as app.py")
25
 
26
 
27
  def sort_coo(coo_matrix):
 
50
 
51
  return results
52
 
53
+ def keywordExtraction(sdg:int,sdgdata:List[Text]):
54
  model_path = "docStore/sdg{}/".format(sdg)
55
  vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
56
  tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
57
  features = vectorizer.get_feature_names_out()
58
+ tf_idf_vector=tfidfmodel.transform(vectorizer.transform(sdgdata))
59
+ sorted_items=sort_coo(tf_idf_vector.tocoo())
60
+ top_n = int(config.get('tfidf', 'TOP_N'))
61
+ results=extract_topn_from_vector(features,sorted_items,top_n)
62
+ keywords = [keyword for keyword in results]
63
+ return keywords
64
+
65
+
66
+
67
 
68
 
69
 
utils/uploadAndExample.py CHANGED
@@ -18,9 +18,6 @@ def add_upload(choice):
18
  bytes_data = uploaded_file.getvalue()
19
  temp.write(bytes_data)
20
  st.session_state['filename'] = uploaded_file.name
21
- # file_name = uploaded_file.name
22
- # file_path = temp.name
23
- # st.session_state['filename'] = file_name
24
  st.session_state['filepath'] = temp.name
25
 
26
 
 
18
  bytes_data = uploaded_file.getvalue()
19
  temp.write(bytes_data)
20
  st.session_state['filename'] = uploaded_file.name
 
 
 
21
  st.session_state['filepath'] = temp.name
22
 
23