hellopahe committed
Commit e7699c1
1 Parent(s): 90f83ff

add lexrank

LexRank.py ADDED
@@ -0,0 +1,124 @@
+"""
+LexRank implementation
+Source: https://github.com/crabcamp/lexrank/tree/dev
+"""
+
+import logging
+
+import numpy as np
+from scipy.sparse.csgraph import connected_components
+from scipy.special import softmax
+
+logger = logging.getLogger(__name__)
+
+
+def degree_centrality_scores(
+    similarity_matrix,
+    threshold=None,
+    increase_power=True,
+):
+    if not (
+        threshold is None
+        or isinstance(threshold, float)
+        and 0 <= threshold < 1
+    ):
+        raise ValueError(
+            '\'threshold\' should be a floating-point number '
+            'from the interval [0, 1) or None',
+        )
+
+    if threshold is None:
+        markov_matrix = create_markov_matrix(similarity_matrix)
+    else:
+        markov_matrix = create_markov_matrix_discrete(
+            similarity_matrix,
+            threshold,
+        )
+
+    scores = stationary_distribution(
+        markov_matrix,
+        increase_power=increase_power,
+        normalized=False,
+    )
+
+    return scores
+
+
+def _power_method(transition_matrix, increase_power=True, max_iter=10000):
+    eigenvector = np.ones(len(transition_matrix))
+
+    if len(eigenvector) == 1:
+        return eigenvector
+
+    transition = transition_matrix.transpose()
+
+    for _ in range(max_iter):
+        eigenvector_next = np.dot(transition, eigenvector)
+
+        if np.allclose(eigenvector_next, eigenvector):
+            return eigenvector_next
+
+        eigenvector = eigenvector_next
+
+        if increase_power:
+            transition = np.dot(transition, transition)
+
+    logger.warning(
+        'Maximum number of iterations for power method exceeded without convergence!'
+    )
+    return eigenvector_next
+
+
+def connected_nodes(matrix):
+    _, labels = connected_components(matrix)
+
+    groups = []
+
+    for tag in np.unique(labels):
+        group = np.where(labels == tag)[0]
+        groups.append(group)
+
+    return groups
+
+
+def create_markov_matrix(weights_matrix):
+    n_1, n_2 = weights_matrix.shape
+    if n_1 != n_2:
+        raise ValueError('\'weights_matrix\' should be square')
+
+    row_sum = weights_matrix.sum(axis=1, keepdims=True)
+
+    # Normalize the probability distribution differently if
+    # we have negative transition values.
+    if np.min(weights_matrix) <= 0:
+        return softmax(weights_matrix, axis=1)
+
+    return weights_matrix / row_sum
+
+
+def create_markov_matrix_discrete(weights_matrix, threshold):
+    discrete_weights_matrix = np.zeros(weights_matrix.shape)
+    ixs = np.where(weights_matrix >= threshold)
+    discrete_weights_matrix[ixs] = 1
+
+    return create_markov_matrix(discrete_weights_matrix)
+
+
+def stationary_distribution(
+    transition_matrix,
+    increase_power=True,
+    normalized=True,
+):
+    n_1, n_2 = transition_matrix.shape
+    if n_1 != n_2:
+        raise ValueError('\'transition_matrix\' should be square')
+
+    distribution = np.zeros(n_1)
+
+    grouped_indices = connected_nodes(transition_matrix)
+
+    for group in grouped_indices:
+        t_matrix = transition_matrix[np.ix_(group, group)]
+        eigenvector = _power_method(t_matrix, increase_power=increase_power)
+        distribution[group] = eigenvector
+
+    if normalized:
+        distribution /= n_1
+
+    return distribution
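
Not part of the commit, but as a quick orientation: `degree_centrality_scores` takes a square pairwise-similarity matrix and returns one centrality score per row. A minimal sketch with an invented 4x4 toy matrix (the values are illustrative only, not real sentence similarities):

```python
import numpy as np

from LexRank import degree_centrality_scores

# Invented toy similarity matrix for four "sentences"
# (symmetric, with 1.0 self-similarity on the diagonal).
similarity = np.array([
    [1.0, 0.8, 0.6, 0.1],
    [0.8, 1.0, 0.5, 0.2],
    [0.6, 0.5, 1.0, 0.1],
    [0.1, 0.2, 0.1, 1.0],
])

scores = degree_centrality_scores(similarity, threshold=None)

# Sort indices from most to least central sentence.
ranking = np.argsort(-scores)
print(ranking)
```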
app.py CHANGED
@@ -8,6 +8,10 @@ from embed import Embed
 import tensorflow as tf
 
+import numpy
+from harvesttext import HarvestText
+from sentence_transformers import SentenceTransformer, util
+from LexRank import degree_centrality_scores
+
 
 class SummaryExtractor(object):
     def __init__(self):
@@ -16,16 +20,51 @@ class SummaryExtractor(object):
         self.tokenizer = PegasusTokenizer.from_pretrained("IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese")
         self.text2text_genr = Text2TextGenerationPipeline(self.model, self.tokenizer, device=self.device)
 
-    def extract(self, content: str, min=20, max=30) -> str:
-        return str(self.text2text_genr(content, do_sample=False, min_length=min, max_length=max, num_return_sequences=3)[0]["generated_text"])
+    def extract(self, content: str) -> str:
+        return str(self.text2text_genr(content, do_sample=False, num_return_sequences=3)[0]["generated_text"])
+
+
+class LexRank(object):
+    def __init__(self):
+        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
+        self.ht = HarvestText()
+
+    def find_central(self, content: str):
+        sentences = self.ht.cut_sentences(content)
+        embeddings = self.model.encode(sentences, convert_to_tensor=True)
+
+        # Compute the pairwise cosine similarities.
+        cos_scores = util.cos_sim(embeddings, embeddings).numpy()
+
+        # Compute the centrality for each sentence.
+        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
+
+        # Argsort so that the first element is the sentence with the
+        # highest score, then return the sentences in that order.
+        most_central_sentence_indices = numpy.argsort(-centrality_scores)
+        return [sentences[idx] for idx in most_central_sentence_indices]
+
+
+# ---===--- worker instances ---===---
 t_randeng = SummaryExtractor()
 embedder = Embed()
+lex = LexRank()
 
 
 def randeng_extract(content):
-    return t_randeng.extract(content)
+    # Keep the most central sentences that fit a ~500-character budget
+    # (always at least one), then summarize only that subset.
+    sentences = lex.find_central(content)
+    num = 500
+    ptr = len(sentences)
+    for index, sentence in enumerate(sentences):
+        num -= len(sentence)
+        if num < 0:
+            ptr = max(index, 1)
+            break
+    return t_randeng.extract("".join(sentences[:ptr]))
 
 
 def similarity_check(inputs: list):
@@ -42,13 +81,13 @@ with gr.Blocks() as app:
     # text_output = gr.Textbox()
     # text_button = gr.Button("生成摘要")
     with gr.Tab("Randeng-Pegasus-523M"):
-        text_input_1 = gr.Textbox()
-        text_output_1 = gr.Textbox()
+        text_input_1 = gr.Textbox(label="请输入长文本:", max_lines=1000)
+        text_output_1 = gr.Textbox(label="摘要文本")
         text_button_1 = gr.Button("生成摘要")
     with gr.Tab("相似度检测"):
         with gr.Row():
-            text_input_query = gr.Textbox()
-            text_input_doc = gr.Textbox()
+            text_input_query = gr.Textbox(label="查询文本")
+            text_input_doc = gr.Textbox(lines=10, label="逐行输入待比较的文本列表")
         text_button_similarity = gr.Button("对比相似度")
         text_output_similarity = gr.Textbox()
 
@@ -56,4 +95,7 @@ with gr.Blocks() as app:
     text_button_1.click(randeng_extract, inputs=text_input_1, outputs=text_output_1)
     text_button_similarity.click(similarity_check, inputs=[text_input_query, text_input_doc], outputs=text_output_similarity)
 
-app.launch()
+app.launch(
+    # share=True,
+    # debug=True
+)
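
The heart of the new `randeng_extract` flow is the budget step: sentences arrive sorted by centrality and are kept greedily until roughly 500 characters are consumed. A standalone sketch of just that step, with `select_by_budget` as an invented helper name and toy sentences (no models needed):

```python
def select_by_budget(ranked_sentences, budget=500):
    """Keep the most central sentences that fit the character budget (at least one)."""
    ptr = len(ranked_sentences)  # keep everything if the text already fits
    remaining = budget
    for index, sentence in enumerate(ranked_sentences):
        remaining -= len(sentence)
        if remaining < 0:
            ptr = max(index, 1)  # stop before the overrun, but never return nothing
            break
    return "".join(ranked_sentences[:ptr])


# Toy input, already ordered from most to least central.
ranked = ["最重要的句子。", "其次重要的句子。", "边缘细节。"]
print(select_by_budget(ranked, budget=12))  # -> 最重要的句子。
```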
article_extractor/tokenizers_pegasus.py CHANGED
@@ -20,7 +20,7 @@ import sys
 # sys.path.append("../../../../")
 
 jieba.dt.tmp_dir = os.path.expanduser(
-    "../tmp/")
+    "tmp/")
 # jieba.enable_parallel(8)
 jieba.initialize()
 
requirements.txt CHANGED
@@ -11,4 +11,8 @@ jieba
 deepspeed
 jieba-fast
 protobuf
-datasets
+datasets
+
+gradio
+
+sentence-transformers