prashant
committed on
Commit
•
53e0cf4
1
Parent(s):
c8b3108
keyword extraction update
Browse files- paramconfig.cfg +0 -1
- utils/keyword_extraction.py +49 -4
- utils/uploadAndExample.py +0 -8
paramconfig.cfg
CHANGED
@@ -33,4 +33,3 @@ SPLIT_OVERLAP_SENTENCE = 1
|
|
33 |
|
34 |
[tfidf]
|
35 |
TOP_N = 20
|
36 |
-
TEXTRANK_WORDS = 20
|
|
|
33 |
|
34 |
[tfidf]
|
35 |
TOP_N = 20
|
|
utils/keyword_extraction.py
CHANGED
@@ -27,14 +27,31 @@ except Exception:
|
|
27 |
|
28 |
|
29 |
def sort_coo(coo_matrix):
|
|
|
|
|
|
|
|
|
30 |
tuples = zip(coo_matrix.col, coo_matrix.data)
|
31 |
return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
|
32 |
|
33 |
-
def extract_topn_from_vector(feature_names, sorted_items,
|
34 |
-
"""get the feature names and tf-idf score of top n items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
#use only topn items from vector
|
37 |
-
sorted_items = sorted_items[:
|
38 |
score_vals = []
|
39 |
feature_vals = []
|
40 |
|
@@ -53,6 +70,20 @@ def extract_topn_from_vector(feature_names, sorted_items, topn=10):
|
|
53 |
return results
|
54 |
|
55 |
def keywordExtraction(sdg:int,sdgdata:List[Text]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
model_path = "docStore/sdg{}/".format(sdg)
|
57 |
vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
|
58 |
tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
|
@@ -64,7 +95,21 @@ def keywordExtraction(sdg:int,sdgdata:List[Text]):
|
|
64 |
keywords = [keyword for keyword in results]
|
65 |
return keywords
|
66 |
|
67 |
-
def textrank(textdata, ratio = 0.1, words = 0):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
if words == 0:
|
69 |
try:
|
70 |
words = int(config.get('sdg','TOP_KEY'))
|
|
|
27 |
|
28 |
|
29 |
def sort_coo(coo_matrix):
|
30 |
+
"""
|
31 |
+
It takes Coordinate format scipy sparse matrix and extracts info from same.\
|
32 |
+
1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb
|
33 |
+
"""
|
34 |
tuples = zip(coo_matrix.col, coo_matrix.data)
|
35 |
return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
|
36 |
|
37 |
+
def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
|
38 |
+
"""get the feature names and tf-idf score of top n items
|
39 |
+
|
40 |
+
Params
|
41 |
+
---------
|
42 |
+
feature_names: list of words from vectorizer
|
43 |
+
sorted_items: tuple returned by sort_coo function defined in \
|
44 |
+
keyword_extraction.py
|
45 |
+
top_n: number of top words to be extracted using tfidf
|
46 |
+
|
47 |
+
Return
|
48 |
+
----------
|
49 |
+
results: top extracted keywords
|
50 |
+
|
51 |
+
"""
|
52 |
|
53 |
#use only topn items from vector
|
54 |
+
sorted_items = sorted_items[:top_n]
|
55 |
score_vals = []
|
56 |
feature_vals = []
|
57 |
|
|
|
70 |
return results
|
71 |
|
72 |
def keywordExtraction(sdg:int,sdgdata:List[Text]):
|
73 |
+
"""
|
74 |
+
TFIDF based keywords extraction
|
75 |
+
|
76 |
+
Params
|
77 |
+
---------
|
78 |
+
sdg: which sdg tfidf model to be used
|
79 |
+
sdgdata: text data from which keywords are to be extracted
|
80 |
+
|
81 |
+
|
82 |
+
Return
|
83 |
+
----------
|
84 |
+
keywords: top extracted keywords
|
85 |
+
|
86 |
+
"""
|
87 |
model_path = "docStore/sdg{}/".format(sdg)
|
88 |
vectorizer = pickle.load(open(model_path+'vectorizer.pkl', 'rb'))
|
89 |
tfidfmodel = pickle.load(open(model_path+'tfidfmodel.pkl', 'rb'))
|
|
|
95 |
keywords = [keyword for keyword in results]
|
96 |
return keywords
|
97 |
|
98 |
+
def textrank(textdata:Text, ratio:float = 0.1, words = 0):
|
99 |
+
"""
|
100 |
+
wrapper function to perform textrank, uses either ratio or wordcount to
|
101 |
+
extract top keywords limited by words or ratio.
|
102 |
+
|
103 |
+
Params
|
104 |
+
--------
|
105 |
+
textdata: text data to perform the textrank.
|
106 |
+
ratio: float to limit the number of keywords as proportion of total token \
|
107 |
+
in textdata
|
108 |
+
words: number of keywords to be extracted. Takes priority over ratio if \
|
109 |
+
non-zero. However, in case the pagerank returns fewer keywords than \
|
110 |
+
the fixed value, then ratio is used.
|
111 |
+
|
112 |
+
"""
|
113 |
if words == 0:
|
114 |
try:
|
115 |
words = int(config.get('sdg','TOP_KEY'))
|
utils/uploadAndExample.py
CHANGED
@@ -31,11 +31,3 @@ def add_upload(choice):
|
|
31 |
file_name = file_path = files[option]
|
32 |
st.session_state['filename'] = file_name
|
33 |
st.session_state['filepath'] = file_path
|
34 |
-
# if option is 'South Africa:Low Emission strategy':
|
35 |
-
# file_name = file_path = 'docStore/sample/South Africa_s Low Emission Development Strategy.txt'
|
36 |
-
# st.session_state['filename'] = file_name
|
37 |
-
# st.session_state['filepath'] = file_path
|
38 |
-
# else:
|
39 |
-
# file_name = file_path = 'docStore/sample/Ethiopia_s_2021_10 Year Development Plan.txt'
|
40 |
-
# st.session_state['filename'] = file_name
|
41 |
-
# st.session_state['filepath'] = file_path
|
|
|
31 |
file_name = file_path = files[option]
|
32 |
st.session_state['filename'] = file_name
|
33 |
st.session_state['filepath'] = file_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|