File size: 2,381 Bytes
fc3b461 0e0caa9 fc3b461 0e0caa9 5bc4948 0e0caa9 fc3b461 0e0caa9 fc3b461 0e0caa9 5bc4948 0e0caa9 5bc4948 fc3b461 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
import pickle
from typing import List, Text
import configparser
import logging
from summa import keywords
# Optional dependency: termcolor is only needed for coloured console output.
try:
    from termcolor import colored
except ImportError:
    logging.info("termcolor not installed")

# Optional dependency: streamlit is only present when running inside the app.
# Bind the name to None on failure so later code can test for availability
# instead of hitting a NameError.
try:
    import streamlit as st
except ImportError:
    st = None
    logging.info("Streamlit not installed")

# Module-level configuration, read once at import time from the current
# working directory. Loading is best-effort: on failure `config` stays an
# empty parser and the problem is reported instead of raised.
config = configparser.ConfigParser()
try:
    # The context manager closes the file handle even if parsing fails.
    with open('paramconfig.cfg') as config_file:
        config.read_file(config_file)
except Exception:
    logging.warning("paramconfig file not found")
    # Only surface the hint in the UI when streamlit could be imported.
    if st is not None:
        st.info("Please place the paramconfig file in the same directory as app.py")
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO-style matrix, largest value first.

    Args:
        coo_matrix: Any object exposing parallel ``col`` and ``data``
            sequences (e.g. the result of ``.tocoo()`` on a sparse matrix).

    Returns:
        A list of ``(col, data)`` tuples ordered by descending value; ties
        on value are broken by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Sort on (value, column) so equal values get a deterministic order.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf score of the top n items.

    Args:
        feature_names: Indexable collection mapping a feature index to its
            name.
        sorted_items: Sequence of ``(feature_index, score)`` pairs. No
            sorting happens here; the pairs are assumed to be ordered
            already (e.g. by ``sort_coo``).
        topn: Number of leading pairs to keep.

    Returns:
        A dict mapping feature name to its score rounded to 3 decimals, in
        the order the pairs were given. If a feature name occurs more than
        once among the kept pairs, the last score wins.
    """
    # Only the first `topn` pairs are considered.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
def keywordExtraction(sdg:int,sdgdata:List[Text]):
    """Extract the top tf-idf keywords for the given texts of one SDG.

    Loads the pickled vectorizer and tf-idf model stored under
    ``docStore/sdg<sdg>/``, transforms ``sdgdata`` with them and returns the
    names of the highest-scoring features.

    Args:
        sdg: SDG number; selects the ``docStore/sdg<sdg>/`` model directory.
        sdgdata: Texts passed to ``vectorizer.transform``.

    Returns:
        A list of feature names, highest tf-idf score first, with at most
        ``TOP_N`` entries (read from the ``tfidf`` section of the module
        config).

    Raises:
        OSError: If a model file cannot be opened.
        configparser.Error: If the ``tfidf``/``TOP_N`` option is missing.
    """
    model_path = "docStore/sdg{}/".format(sdg)

    # NOTE(security): pickle.load can execute arbitrary code; the model files
    # must come from a trusted source.
    # Context managers make sure the file handles are closed after loading.
    with open(model_path + 'vectorizer.pkl', 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)
    with open(model_path + 'tfidfmodel.pkl', 'rb') as model_file:
        tfidfmodel = pickle.load(model_file)

    # presumably a scikit-learn CountVectorizer / TfidfTransformer pair
    # (see the commented-out imports at the top of the file) - TODO confirm
    features = vectorizer.get_feature_names_out()
    tf_idf_vector = tfidfmodel.transform(vectorizer.transform(sdgdata))
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    top_n = int(config.get('tfidf', 'TOP_N'))
    results = extract_topn_from_vector(features, sorted_items, top_n)

    # Return only the feature names. The result is built directly so that no
    # local variable shadows the module-level `keywords` import from summa.
    return list(results)
def textrank(textdata, ratio = 0.1, words = 0):
    """Extract keywords from a text with summa's TextRank implementation.

    Args:
        textdata: Text handed to ``summa.keywords.keywords``.
        ratio: Forwarded as summa's ``ratio`` argument when ``words`` is not
            set.
        words: Forwarded as summa's ``words`` argument when non-zero. ``0``
            or ``None`` means "use ``ratio`` instead".

    Returns:
        A list of keyword strings; an empty list when nothing was extracted.
    """
    # Treat None like 0, otherwise summa would receive words=None and fall
    # back to its own default ratio, silently ignoring the caller's `ratio`.
    if not words:
        raw = keywords.keywords(textdata, ratio=ratio)
    else:
        raw = keywords.keywords(textdata, words=words)

    # An empty result string would split into [''], so drop empty entries.
    return [keyword for keyword in raw.split("\n") if keyword]
|