Spaces:

timpan
/

summary-simi-check4qee

Build error

summary-simi-check4qee / lex_rank_L12.py

hellopahe

remove redundancy

94692cf about 1 year ago

No virus

1.51 kB

	import numpy, nltk
	nltk.download('punkt')


	from harvesttext import HarvestText
	from lex_rank_util import degree_centrality_scores, find_siblings_by_index
	from sentence_transformers import SentenceTransformer, util


	class LexRankL12(object):
	    """Select the most central sentences of a text.

	    Sentences are embedded with a SentenceTransformer model, compared
	    pair-wise by cosine similarity, scored with
	    ``degree_centrality_scores`` and finally picked through
	    ``find_siblings_by_index`` (both imported from ``lex_rank_util``).
	    """

	    def __init__(self):
	        # Sentence encoder used to embed every sentence of the input text.
	        self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
	        # HarvestText instance; only its ``cut_sentences`` is used here, for
	        # text that contains Chinese characters.
	        self.ht = HarvestText()

	    def find_central(self, content: str, num=10, siblings=0):
	        """Return the sentences of ``content`` chosen as most central.

	        Args:
	            content: Text to analyse. It is split with
	                ``HarvestText.cut_sentences`` when it contains Chinese
	                characters and with ``nltk.sent_tokenize`` otherwise.
	            num: Forwarded unchanged to ``find_siblings_by_index``.
	                NOTE(review): presumably the number of central sentences
	                to keep -- confirm against ``lex_rank_util``.
	            siblings: Forwarded unchanged to ``find_siblings_by_index``.
	                NOTE(review): presumably how many neighbouring sentences
	                to include around each pick -- confirm against
	                ``lex_rank_util``.

	        Returns:
	            A list of sentence strings, in the order of the indices
	            returned by ``find_siblings_by_index``. An empty list when
	            ``content`` is empty, whitespace-only, or yields no sentences.
	        """
	        # Nothing to rank: bail out before touching the tokenizer or model.
	        if not content or not content.strip():
	            return []

	        if self.contains_chinese(content):
	            sentences = self.ht.cut_sentences(content)
	        else:
	            sentences = nltk.sent_tokenize(content)

	        # An empty sentence list would otherwise reach the encoder and the
	        # ranking helpers, and any index they return would fail below.
	        if not sentences:
	            return []

	        embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()

	        # Compute the pair-wise cosine similarities
	        cos_scores = util.cos_sim(embeddings, embeddings).numpy()

	        # Compute the centrality for each sentence
	        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

	        # Negate before argsort so indices run from highest to lowest score
	        most_central_sentence_indices = numpy.argsort(-centrality_scores)

	        central_and_siblings = find_siblings_by_index(
	            sentences, most_central_sentence_indices, siblings, num
	        )
	        return [sentences[index] for index in central_and_siblings]

	    def contains_chinese(self, content: str):
	        """Return True if any character of ``content`` is in U+4E00..U+9FA5.

	        Both bounds are inclusive. An empty string yields False.
	        """
	        return any('\u4e00' <= _char <= '\u9fa5' for _char in content)