prashant committed
Commit: fc140bc
1 parent: 44648c8

final fix in SDG

appStore/sdg_analysis.py CHANGED
@@ -11,12 +11,24 @@ import streamlit as st
 from st_aggrid import AgGrid
 from st_aggrid.shared import ColumnsAutoSizeMode
 from utils.sdg_classifier import sdg_classification
-from utils.sdg_classifier import runSDGPreprocessingPipeline
-from utils.keyword_extraction import keywordExtraction, textrank
+from utils.sdg_classifier import runSDGPreprocessingPipeline, load_sdgClassifier
+from utils.keyword_extraction import textrank
 import logging
 logger = logging.getLogger(__name__)
+from utils.checkconfig import getconfig
 
 
+# Declare all the necessary variables
+config = getconfig('paramconfig.cfg')
+model_name = config.get('sdg','MODEL')
+split_by = config.get('sdg','SPLIT_BY')
+split_length = int(config.get('sdg','SPLIT_LENGTH'))
+split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
+remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
+split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))
+threshold = float(config.get('sdg','THRESHOLD'))
+top_n = int(config.get('sdg','TOP_KEY'))
+
 
 def app():
 
@@ -71,35 +83,22 @@ def app():
         """)
         st.markdown("")
 
-    ### Label Dictionary ###
-    _lab_dict = {0: 'no_cat',
-                1:'SDG 1 - No poverty',
-                2:'SDG 2 - Zero hunger',
-                3:'SDG 3 - Good health and well-being',
-                4:'SDG 4 - Quality education',
-                5:'SDG 5 - Gender equality',
-                6:'SDG 6 - Clean water and sanitation',
-                7:'SDG 7 - Affordable and clean energy',
-                8:'SDG 8 - Decent work and economic growth',
-                9:'SDG 9 - Industry, Innovation and Infrastructure',
-                10:'SDG 10 - Reduced inequality',
-                11:'SDG 11 - Sustainable cities and communities',
-                12:'SDG 12 - Responsible consumption and production',
-                13:'SDG 13 - Climate action',
-                14:'SDG 14 - Life below water',
-                15:'SDG 15 - Life on land',
-                16:'SDG 16 - Peace, justice and strong institutions',
-                17:'SDG 17 - Partnership for the goals',}
 
     ### Main app code ###
     with st.container():
         if st.button("RUN SDG Analysis"):
-
-
+
             if 'filepath' in st.session_state:
                 file_name = st.session_state['filename']
                 file_path = st.session_state['filepath']
-                allDocuments = runSDGPreprocessingPipeline(file_path,file_name)
+                classifier = load_sdgClassifier(model_name)
+                allDocuments = runSDGPreprocessingPipeline(fileName= file_name,
+                                    filePath= file_path, split_by= split_by,
+                                    split_length= split_length,
+                                    split_overlap= split_overlap,
+                                    split_respect_sentence_boundary= split_respect_sentence_boundary,
+                                    removePunc= remove_punc)
+
                 if len(allDocuments['documents']) > 100:
                     warning_msg = ": This might take sometime, please sit back and relax."
                 else:
@@ -107,12 +106,15 @@ def app():
 
                 with st.spinner("Running SDG Classification{}".format(warning_msg)):
 
-                    df, x = sdg_classification(allDocuments['documents'])
+                    df, x = sdg_classification(haystackdoc=allDocuments['documents'],
+                                               threshold= threshold,
+                                               classifiermodel= classifier)
+                    df = df.drop(['Relevancy'], axis = 1)
                     sdg_labels = x.SDG.unique()[::-1]
                     textrankkeywordlist = []
                     for label in sdg_labels:
                         sdgdata = " ".join(df[df.SDG == label].text.to_list())
-                        textranklist_ = textrank(sdgdata)
+                        textranklist_ = textrank(textdata=sdgdata, words= top_n)
                         if len(textranklist_) > 0:
                             textrankkeywordlist.append({'SDG':label, 'TextRank Keywords':",".join(textranklist_)})
                     tRkeywordsDf = pd.DataFrame(textrankkeywordlist)
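
Note: all runtime parameters for the SDG analysis are now read once, at import time, from paramconfig.cfg via getconfig (added in utils/checkconfig.py below). The config file itself is not part of this commit; the sketch below is a hypothetical [sdg] section implied by the code above, where only the key names come from the config.get calls and every value is a placeholder.

# paramconfig.cfg (hypothetical sketch; values are placeholders)
[sdg]
# HF-hub id or local path of the document-classification model
MODEL = some-org/sdg-document-classifier
SPLIT_BY = word
SPLIT_LENGTH = 120
SPLIT_OVERLAP = 10
# 0/1 flags, cast via bool(int(...)) in sdg_analysis.py
REMOVE_PUNC = 0
RESPECT_SENTENCE_BOUNDARY = 1
THRESHOLD = 0.85
TOP_KEY = 15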
utils/checkconfig.py ADDED
@@ -0,0 +1,12 @@
+import configparser
+import logging
+
+def getconfig(configFilePath):
+
+    config = configparser.ConfigParser()
+
+    try:
+        config.read_file(open(configFilePath))
+        return config
+    except:
+        logging.warning("config file not found")
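
Note: when the file is missing, getconfig only logs a warning and falls through, so it implicitly returns None; the config.get calls at the top of appStore/sdg_analysis.py would then fail with an AttributeError. A minimal usage sketch with an explicit guard (the guard is a suggestion, not part of this commit):

from utils.checkconfig import getconfig

config = getconfig('paramconfig.cfg')
if config is None:
    # getconfig swallowed the missing-file error, so fail loudly here instead
    raise FileNotFoundError("paramconfig.cfg not found next to app.py")
model_name = config.get('sdg', 'MODEL')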
utils/keyword_extraction.py CHANGED
@@ -5,25 +5,13 @@ import pandas as pd
 # from nltk.corpus import stopwords
 import pickle
 from typing import List, Text
-import configparser
 import logging
 from summa import keywords
 
-try:
-    from termcolor import colored
-except:
-    pass
-
 try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.warning("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")
 
 
 def sort_coo(coo_matrix):
@@ -69,6 +57,30 @@ def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
 
     return results
 
+
+def tfidfKeyword(textdata, vectorizer, tfidfmodel, top_n):
+    """
+    TFIDF based keywords extraction
+
+    Params
+    ---------
+    vectorizer: trained cont vectorizer model
+    tfidfmodel: TFIDF Tranformer model
+    top_n: Top N keywords to be extracted
+    textdata: text data to which needs keyword extraction
+
+    Return
+    ----------
+    keywords: top extracted keywords
+
+    """
+    features = vectorizer.get_feature_names_out()
+    tf_idf_vector=tfidfmodel.transform(vectorizer.transform(textdata))
+    sorted_items=sort_coo(tf_idf_vector.tocoo())
+    results=extract_topn_from_vector(features,sorted_items,top_n)
+    keywords = [keyword for keyword in results]
+    return keywords
+
 def keywordExtraction(sdg:int,sdgdata:List[Text]):
     """
     TFIDF based keywords extraction
@@ -115,12 +127,13 @@ def textrank(textdata:Text, ratio:float = 0.1, words = 0):
     results: extracted keywords
     """
     if words == 0:
-        try:
-            words = int(config.get('sdg','TOP_KEY'))
-            results = keywords.keywords(textdata, words = words).split("\n")
-        except Exception as e:
-            logging.warning(e)
-            results = keywords.keywords(textdata, ratio= ratio).split("\n")
+        # try:
+        #     words = int(config.get('sdg','TOP_KEY'))
+        #     results = keywords.keywords(textdata, words = words).split("\n")
+        # except Exception as e:
+        #     logging.warning(e)
+        logging.info("Textrank using defulat ratio value = 0.1, as no words limit given")
+        results = keywords.keywords(textdata, ratio= ratio).split("\n")
     else:
         try:
             results = keywords.keywords(textdata, words= words).split("\n")
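
Note: the module now exposes two entry points, textrank (summa-based, driven by either a word budget or the default ratio) and the new tfidfKeyword, which expects an already-fitted count vectorizer and TF-IDF transformer. A rough usage sketch, assuming scikit-learn's CountVectorizer/TfidfTransformer as those fitted models (how the repository actually fits or pickles them is not shown in this diff):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from utils.keyword_extraction import textrank, tfidfKeyword

corpus = ["Access to clean water and sanitation remains limited in rural districts.",
          "Extending renewable energy capacity supports affordable and clean energy goals."]

# TextRank keywords over the combined text; words=0 would fall back to the default ratio
tr_keys = textrank(textdata=" ".join(corpus), words=10)

# TF-IDF keywords: fit the vectorizer and transformer first, then pass them in
cv = CountVectorizer(stop_words="english").fit(corpus)
tfidf = TfidfTransformer().fit(cv.transform(corpus))
top_terms = tfidfKeyword(textdata=[" ".join(corpus)], vectorizer=cv,
                         tfidfmodel=tfidf, top_n=5)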
utils/preprocessing.py CHANGED
@@ -179,6 +179,8 @@ class UdfPreProcessor(BaseComponent):
     split_by: document splitting strategy either as word or sentence
     split_length: when synthetically creating the paragrpahs from document,
     it defines the length of paragraph.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+    splititng of text.
 
     Return
     ---------
utils/sdg_classifier.py CHANGED
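
Note: the diff below removes the module-level paramconfig.cfg reads and makes the classifier model, relevancy threshold, and splitting parameters explicit arguments, so the module can also be driven outside the Streamlit app. A rough sketch of the resulting call pattern follows; the model id, file path, and threshold are placeholders, it assumes the new "from typing import List, Tuple, Float" line is adjusted (typing does not export Float), and it uses keyword arguments for load_sdgClassifier because appStore/sdg_analysis.py passes model_name positionally, which lands in the configFile parameter.

from utils.sdg_classifier import load_sdgClassifier, runSDGPreprocessingPipeline, sdg_classification

# placeholder HF-hub model id; alternatively pass configFile='paramconfig.cfg'
classifier = load_sdgClassifier(docClassifierModel="some-org/sdg-document-classifier")

# placeholder file; returns the haystack pipeline output with a 'documents' list
preprocessed = runSDGPreprocessingPipeline(filePath="sample/report.pdf", fileName="report.pdf",
                                           split_by="word", split_length=120, split_overlap=10,
                                           split_respect_sentence_boundary=True,
                                           removePunc=False)

# placeholder threshold; returns the per-paragraph dataframe and the SDG value counts
df, counts = sdg_classification(haystackdoc=preprocessed['documents'],
                                threshold=0.85,
                                classifiermodel=classifier)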
@@ -1,63 +1,55 @@
 from haystack.nodes import TransformersDocumentClassifier
 from haystack.schema import Document
-from typing import List, Tuple
+from typing import List, Tuple, Float
+from typing_extensions import Literal
 import configparser
 import logging
 import pandas as pd
 from pandas import DataFrame, Series
+from utils.checkconfig import getconfig
 from utils.preprocessing import processingpipeline
 try:
     import streamlit as st
 except ImportError:
     logging.info("Streamlit not installed")
-config = configparser.ConfigParser()
-try:
-    config.read_file(open('paramconfig.cfg'))
-except Exception:
-    logging.info("paramconfig file not found")
-    st.info("Please place the paramconfig file in the same directory as app.py")
-
-
-_lab_dict = {0: 'no_cat',
-            1:'SDG 1 - No poverty',
-            2:'SDG 2 - Zero hunger',
-            3:'SDG 3 - Good health and well-being',
-            4:'SDG 4 - Quality education',
-            5:'SDG 5 - Gender equality',
-            6:'SDG 6 - Clean water and sanitation',
-            7:'SDG 7 - Affordable and clean energy',
-            8:'SDG 8 - Decent work and economic growth',
-            9:'SDG 9 - Industry, Innovation and Infrastructure',
-            10:'SDG 10 - Reduced inequality',
-            11:'SDG 11 - Sustainable cities and communities',
-            12:'SDG 12 - Responsible consumption and production',
-            13:'SDG 13 - Climate action',
-            14:'SDG 14 - Life below water',
-            15:'SDG 15 - Life on land',
-            16:'SDG 16 - Peace, justice and strong institutions',
-            17:'SDG 17 - Partnership for the goals',}
 
 @st.cache(allow_output_mutation=True)
-def load_sdgClassifier():
+def load_sdgClassifier(configFile = None, docClassifierModel = None):
     """
     loads the document classifier using haystack, where the name/path of model
-    in HF-hub as string is used to fetch the model object.
+    in HF-hub as string is used to fetch the model object.Either configfile or
+    model should be passed.
     1. https://docs.haystack.deepset.ai/reference/document-classifier-api
     2. https://docs.haystack.deepset.ai/docs/document_classifier
 
+    Params
+    --------
+    configFile: config file from which to read the model name
+    docClassifierModel: if modelname is passed, it takes a priority if not \
+    found then will look for configfile, else raise error.
+
+
     Return: document classifier model
     """
-    logging.info("Loading classifier")
-    doc_classifier_model = config.get('sdg','MODEL')
+    if not docClassifierModel:
+        if not configFile:
+            logging.warning("Pass either model name or config file")
+            return
+        else:
+            config = getconfig(configFile)
+            docClassifierModel = config.get('sdg','MODEL')
+
+    logging.info("Loading classifier")
     doc_classifier = TransformersDocumentClassifier(
-                        model_name_or_path=doc_classifier_model,
-                        task="text-classification")
-    return doc_classifier
-
+                        model_name_or_path=docClassifierModel,
+                        task="text-classification")
 
+    return doc_classifier
+
 
 @st.cache(allow_output_mutation=True)
-def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
+def sdg_classification(haystackdoc:List[Document],
+                       threshold:Float, classifiermodel)->Tuple[DataFrame,Series]:
     """
     Text-Classification on the list of texts provided. Classifier provides the
     most appropriate label for each text. these labels are in terms of if text
@@ -77,11 +69,7 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
 
     """
     logging.info("Working on SDG Classification")
-    threshold = float(config.get('sdg','THRESHOLD'))
-
-
-    classifier = load_sdgClassifier()
-    results = classifier.predict(haystackdoc)
+    results = classifiermodel.predict(haystackdoc)
 
 
     labels_= [(l.meta['classification']['label'],
@@ -92,6 +80,8 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     df = df.sort_values(by="Relevancy", ascending=False).reset_index(drop=True)
     df.index += 1
     df =df[df['Relevancy']>threshold]
+
+    # creating the dataframe for value counts of SDG, along with 'title' of SDGs
     x = df['SDG'].value_counts()
     x = x.rename('count')
     x = x.rename_axis('SDG').reset_index()
@@ -99,14 +89,17 @@ def sdg_classification(haystackdoc:List[Document])->Tuple[DataFrame,Series]:
     x = x.sort_values(by=['count'])
     x['SDG_name'] = x['SDG'].apply(lambda x: _lab_dict[x])
    x['SDG_Num'] = x['SDG'].apply(lambda x: "SDG "+str(x))
-    df= df.drop(['Relevancy'], axis = 1)
+
     df['SDG'] = pd.to_numeric(df['SDG'])
     df = df.sort_values('SDG')
 
-
     return df, x
 
-def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
+def runSDGPreprocessingPipeline(filePath, fileName,
+                        split_by: Literal["sentence", "word"] = 'sentence',
+                        split_respect_sentence_boundary = False,
+                        split_length:int = 2, split_overlap = 0,
+                        removePunc = False)->List[Document]:
     """
     creates the pipeline and runs the preprocessing pipeline,
     the params for pipeline are fetched from paramconfig
@@ -117,7 +110,12 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     file_name: filename, in case of streamlit application use
     st.session_state['filename']
     file_path: filepath, in case of streamlit application use
-    st.session_state['filepath']
+    removePunc: to remove all Punctuation including ',' and '.' or not
+    split_by: document splitting strategy either as word or sentence
+    split_length: when synthetically creating the paragrpahs from document,
+    it defines the length of paragraph.
+    split_respect_sentence_boundary: Used when using 'word' strategy for
+    splititng of text.
 
 
     Return
@@ -130,16 +128,11 @@ def runSDGPreprocessingPipeline(filePath, fileName)->List[Document]:
     """
 
     sdg_processing_pipeline = processingpipeline()
-    split_by = config.get('sdg','SPLIT_BY')
-    split_length = int(config.get('sdg','SPLIT_LENGTH'))
-    split_overlap = int(config.get('sdg','SPLIT_OVERLAP'))
-    remove_punc = bool(int(config.get('sdg','REMOVE_PUNC')))
-    split_respect_sentence_boundary = bool(int(config.get('sdg','RESPECT_SENTENCE_BOUNDARY')))
 
     output_sdg_pre = sdg_processing_pipeline.run(file_paths = filePath,
                             params= {"FileConverter": {"file_path": filePath, \
                                         "file_name": fileName},
-                                     "UdfPreProcessor": {"removePunc": remove_punc, \
+                                     "UdfPreProcessor": {"removePunc": removePunc, \
                                         "split_by": split_by, \
                                         "split_length":split_length,\
                                         "split_overlap": split_overlap, \