File size: 4,308 Bytes
fc3b461 0e0caa9 fc3b461 0e0caa9 5bc4948 0e0caa9 fc3b461 53e0cf4 fc3b461 53e0cf4 fc3b461 53e0cf4 fc3b461 b114d3b fc3b461 fc140bc 9f55059 fc140bc d7ce857 53e0cf4 fc3b461 0e0caa9 d7ce857 0e0caa9 f59362a 9f55059 53e0cf4 98746bf 53e0cf4 98746bf 53e0cf4 5bc4948 fc140bc 5bc4948 11e64f9 0e0caa9 5bc4948 fc3b461 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import pickle
from typing import List, Text
import logging
from summa import keywords
try:
import streamlit as st
except ImportError:
logging.info("Streamlit not installed")
def sort_coo(coo_matrix):
    """Sort the entries of a COO-format scipy sparse matrix by score.

    Pairs each stored column index with its value and orders the pairs by
    (value, column index), highest first.
    1. https://kavita-ganesan.com/python-keyword-extraction/#.Y2-TFHbMJPb

    Params
    ---------
    coo_matrix: scipy sparse matrix in coordinate (COO) format

    Return
    ----------
    list of (column_index, value) tuples sorted in descending order
    """
    pairs = [(col, val) for col, val in zip(coo_matrix.col, coo_matrix.data)]
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
def extract_topn_from_vector(feature_names, sorted_items, top_n=10):
    """get the feature names and tf-idf score of top n items

    Params
    ---------
    feature_names: list of words from vectorizer
    sorted_items: tuple returned by sort_coo function defined in \
        keyword_extraction.py
    top_n: topn words to be extracted using tfidf

    Return
    ----------
    results: dict mapping each of the top-n feature names to its
        tf-idf score rounded to 3 decimal places
    """
    # Truncate first, then map index -> feature name and round the score.
    # Later entries for the same feature overwrite earlier ones, matching
    # plain dict-insertion semantics.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:top_n]
    }
def tfidf_keyword(textdata:str, vectorizer, tfidfmodel, top_n):
    """
    TFIDF based keywords extraction

    Params
    ---------
    vectorizer: trained cont vectorizer model
    tfidfmodel: TFIDF Tranformer model
    top_n: Top N keywords to be extracted
    textdata: text data to which needs keyword extraction

    Return
    ----------
    keywords: top extracted keywords
    """
    # NOTE(review): textdata is passed straight to vectorizer.transform,
    # which expects an iterable of documents — confirm callers pass a
    # list of strings rather than one raw string.
    feature_names = vectorizer.get_feature_names_out()
    counts = vectorizer.transform(textdata)
    weighted = tfidfmodel.transform(counts)
    ranked = sort_coo(weighted.tocoo())
    top_terms = extract_topn_from_vector(feature_names, ranked, top_n)
    return list(top_terms)
def keyword_extraction(sdg:int,sdgdata:List[Text], top_n:int=10):
    """
    TFIDF based keywords extraction using a pre-trained per-SDG model.

    Params
    ---------
    sdg: which sdg tfidf model to be used
    sdgdata: text data to which needs keyword extraction
    top_n: maximum number of keywords to return (default 10)

    Return
    ----------
    keywords: top extracted keywords
    """
    model_path = "docStore/sdg{}/".format(sdg)
    # Use context managers so the pickle file handles are closed promptly;
    # the previous pickle.load(open(...)) form leaked both handles.
    # NOTE(review): pickle.load is only safe on trusted, project-shipped
    # model files — never load untrusted pickles.
    with open(model_path + 'vectorizer.pkl', 'rb') as fh:
        vectorizer = pickle.load(fh)
    with open(model_path + 'tfidfmodel.pkl', 'rb') as fh:
        tfidfmodel = pickle.load(fh)
    # Delegate the actual scoring/ranking to tfidf_keyword, which performs
    # exactly the transform -> sort -> top-n pipeline duplicated here before.
    return tfidf_keyword(sdgdata, vectorizer, tfidfmodel, top_n)
def textrank(textdata:Text, ratio:float = 0.1, words:int = 0)->List[str]:
    """
    Wrapper function to perform textrank; uses either ratio or word count to
    limit the number of extracted keywords.
    1. https://github.com/summanlp/textrank/blob/master/summa/keywords.py

    Params
    --------
    textdata: text data to perform the textrank on.
    ratio: float to limit the number of keywords as proportion of total token \
        in textdata
    words: number of keywords to be extracted. Takes priority over ratio if \
        non-zero. However, in case pagerank returns fewer keywords than \
        the fixed value, ratio is used instead.

    Return
    --------
    results: extracted keywords, one keyword per list element
    """
    if words == 0:
        logging.info("Textrank using default ratio value = 0.1, as no words limit given")
        results = keywords.keywords(textdata, ratio=ratio).split("\n")
    else:
        try:
            results = keywords.keywords(textdata, words=words).split("\n")
        except Exception:
            # summa can fail when it finds fewer candidates than `words`;
            # fall back to ratio-based extraction. Narrowed from a bare
            # `except:` so KeyboardInterrupt/SystemExit propagate.
            results = keywords.keywords(textdata, ratio=ratio).split("\n")
    return results


# Apply streamlit caching only when streamlit imported successfully at the
# top of the file. Previously the `@st.cache` decorator raised NameError at
# import time whenever streamlit was missing, defeating the guarded import.
try:
    textrank = st.cache(allow_output_mutation=True)(textrank)
except NameError:
    logging.info("Streamlit not installed; textrank will not be cached")
|