import nltk
import numpy

from harvesttext import HarvestText
from lex_rank_util import degree_centrality_scores, find_siblings_by_index
from sentence_transformers import SentenceTransformer, util

nltk.download('punkt')


class LexRankText2VecV1(object):
    def __init__(self):
        # Sentence-embedding model tuned for Chinese paraphrase similarity.
        self.model = SentenceTransformer('shibing624/text2vec-base-chinese-paraphrase')
        # HarvestText handles Chinese sentence segmentation.
        self.ht = HarvestText()

    def find_central(self, content: str, num=10, siblings=0):
        # Split into sentences: HarvestText for Chinese, NLTK for other languages.
        if self.contains_chinese(content):
            sentences = self.ht.cut_sentences(content)
        else:
            sentences = nltk.sent_tokenize(content)

        embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()

        # Compute the pairwise cosine similarities.
        cos_scores = util.cos_sim(embeddings, embeddings).numpy()

        # Compute the degree-centrality (LexRank) score for each sentence.
        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

        # Argsort so that the first element is the sentence with the highest score.
        most_central_sentence_indices = numpy.argsort(-centrality_scores)

        # Keep the top `num` sentences, optionally expanded with `siblings`
        # neighboring sentences around each central one.
        central_and_siblings = find_siblings_by_index(
            sentences, most_central_sentence_indices, siblings, num
        )
        return [sentences[index] for index in central_and_siblings]

    def contains_chinese(self, content: str):
        # True if any character falls in the common CJK Unified Ideographs range.
        for _char in content:
            if '\u4e00' <= _char <= '\u9fa5':
                return True
        return False
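

# Minimal usage sketch (an assumption, not part of the original file): it
# presumes lex_rank_util is on the import path and the model download
# succeeds. The sample text below is illustrative only.
if __name__ == "__main__":
    extractor = LexRankText2VecV1()
    text = (
        "LexRank ranks sentences by centrality in a similarity graph. "
        "Sentences similar to many others are considered central. "
        "The most central sentences form an extractive summary."
    )
    # Return the 2 most central sentences, without neighboring siblings.
    for sentence in extractor.find_central(text, num=2, siblings=0):
        print(sentence)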