Upload 10 files

Browse files

Files changed (10) hide show

app.py +25 -0
requirements.txt +2 -0
src/__init__.py +1 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/__pycache__/__init__.cpython-38.pyc +0 -0
src/__pycache__/bert3.cpython-38.pyc +0 -0
src/__pycache__/summary.cpython-310.pyc +0 -0
src/__pycache__/summary.cpython-38.pyc +0 -0
src/bert3.py +37 -0
src/summary.py +61 -0

app.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import streamlit as st
+#from src.summary import summarize
+#from src.bert2 import get_summary
+#from src.mbart import predictions
+from src.bert3 import predictions
def main():
    """Render the summarization page and run the model when requested.

    The page shows a header, two sub-headers and a text area. When the
    "Summarize" button is pressed, the entered text is passed to
    ``predictions`` and the result is shown in a success box. If the text
    area is empty or contains only whitespace, an error box is shown
    instead and the model is not called.
    """
    st.header("Text Summarization using BERT")
    st.subheader("This app will summarize the long piece of input text in a few sentences")
    st.subheader("Paste your long text below:")
    text = st.text_area(label="Input text")
    if st.button("Summarize"):
        # A whitespace-only string is truthy, so test the stripped text:
        # there is nothing to summarize in blanks and newlines.
        if text.strip():
            summary_result = predictions(text)
            st.success(summary_result)
        else:
            st.error("Please paste or write(!) some text")


if __name__ == '__main__':
    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ streamlit
2	+ pandas

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (149 Bytes). View file

src/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (147 Bytes). View file

src/__pycache__/bert3.cpython-38.pyc ADDED Viewed

Binary file (1.1 kB). View file

src/__pycache__/summary.cpython-310.pyc ADDED Viewed

Binary file (2.18 kB). View file

src/__pycache__/summary.cpython-38.pyc ADDED Viewed

Binary file (2.18 kB). View file

src/bert3.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#import razdel
+#import torch
+#from datasets import load_dataset
+import pandas as pd
+import numpy as np
+#import gensim
+#from tqdm.auto import tqdm
+from transformers import AutoTokenizer, EncoderDecoderModel
# Checkpoint identifier shared by the tokenizer and the model below.
model_name = "IlyaGusev/rubert_telegram_headlines"

# Both objects are created at import time, so importing this module triggers
# the (potentially slow) loading of the tokenizer and the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=False,
    do_basic_tokenize=False,
    strip_accents=False,
)
model = EncoderDecoderModel.from_pretrained(model_name)
def get_summary(article_text):
    """Generate a short headline-style summary for ``article_text``.

    The text is tokenized as a single-item batch, truncated and padded to
    exactly 256 tokens, and fed to the module-level ``model``. Generation
    uses beam search with 10 beams, forbids repeated 3-grams and is capped
    at 64 tokens. Only the first returned sequence is decoded.

    Args:
        article_text: The text to summarize.

    Returns:
        The decoded string with special tokens removed.
    """
    encoded = tokenizer(
        [article_text],
        add_special_tokens=True,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    generated = model.generate(
        input_ids=encoded["input_ids"],
        max_length=64,
        no_repeat_ngram_size=3,
        num_beams=10,
        top_p=0.95
    )
    # The batch holds a single article, so the first sequence is the result.
    best_sequence = generated[0]
    return tokenizer.decode(
        best_sequence,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
def predictions(text):
    """Return the summary that ``get_summary`` produces for ``text``.

    This is the entry point imported by the application; it simply
    delegates to ``get_summary``.
    """
    return get_summary(text)

src/summary.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import re
def get_sentences(txt):
    """Split ``txt`` into pieces at every full stop.

    The pieces are returned exactly as they appear between the dots:
    surrounding whitespace is kept, and a trailing dot produces a final
    empty string.
    """
    sentence_separator = '.'
    pieces = txt.split(sentence_separator)
    return pieces
def get_words(txt):
    """Return the words of ``txt`` with punctuation removed.

    Every character that is neither a letter, a digit nor whitespace is
    deleted (so ``"don't"`` becomes ``"dont"``), then the text is split on
    runs of whitespace.

    Compared with a plain ASCII filter this keeps letters of any script
    (``\\w`` is Unicode-aware for ``str`` patterns), removes ``^`` like any
    other punctuation, and never yields empty-string "words" for repeated
    or leading/trailing spaces.

    Args:
        txt: The text to tokenize.

    Returns:
        A list of word strings; empty when ``txt`` contains no words.
    """
    # ``\w`` also matches the underscore, which is punctuation here, so it
    # is removed explicitly.
    only_words_text = re.sub(r'[^\w\s]|_', '', txt)
    return only_words_text.split()
def get_keywords(word_list, min_ratio=0.001, max_ratio=0.5):
    """Select the words whose relative frequency lies within a given band.

    A word's ratio is its number of occurrences divided by the total
    number of entries in ``word_list``. Words with a ratio between
    ``min_ratio`` and ``max_ratio`` (both inclusive) are returned.

    Args:
        word_list: Sequence of words, repetitions included.
        min_ratio: Lowest accepted ratio; must be below 1.
        max_ratio: Highest accepted ratio; must be below 1.

    Returns:
        A set of the selected words (empty for an empty ``word_list``).
    """
    assert min_ratio < 1 and max_ratio < 1

    # Tally how often each word occurs.
    frequencies = {}
    for token in word_list:
        frequencies[token] = frequencies.get(token, 0) + 1

    total = len(word_list)
    return {
        token
        for token, occurrences in frequencies.items()
        if min_ratio <= occurrences * 1.0 / total <= max_ratio
    }
def get_sentence_weight(sentence, keywords):
    """Score a sentence by how densely its keywords are packed.

    The sentence is split on whitespace. The "window" runs from the first
    keyword occurrence to the last one, inclusive, and the weight is
    ``keyword_count ** 2 / window_size``.

    The window may start and end on the very first word, so a sentence
    whose only keyword is its first word scores 1.0 rather than 0.
    Splitting on any whitespace keeps leading or repeated spaces from
    producing empty tokens that would widen the window.

    Args:
        sentence: The sentence text.
        keywords: Container of keyword strings (a set gives fast lookups).

    Returns:
        The weight as a float, or 0 when the sentence has no keyword.
    """
    tokens = sentence.split()
    keyword_positions = [
        position for position, token in enumerate(tokens) if token in keywords
    ]
    if not keyword_positions:
        return 0

    window_size = keyword_positions[-1] - keyword_positions[0] + 1
    keywords_cnt = len(keyword_positions)
    return keywords_cnt * keywords_cnt * 1.0 / window_size
def summarize(text):
    """Build an extractive summary from the highest-weighted sentences.

    Steps:
      1. Line breaks are replaced by spaces so that words on adjacent
         lines are not glued together.
      2. Keywords are the words whose relative frequency is between 5%
         and 50% (see ``get_keywords``).
      3. The text is split into sentences; blank pieces are discarded and
         the rest are stripped of surrounding whitespace.
      4. Each sentence is scored with ``get_sentence_weight``; the top 20%
         (but always at least one sentence) are kept.
      5. The kept sentences are emitted in their original order, joined
         with ``'. '``.

    Args:
        text: The text to summarize.

    Returns:
        The summary string, or ``''`` when ``text`` contains no sentences.
    """
    txt = text.replace('\n', ' ')
    keywords = get_keywords(get_words(txt), 0.05, 0.5)

    stripped = (piece.strip() for piece in get_sentences(txt))
    sentence_list = [piece for piece in stripped if piece]
    if not sentence_list:
        return ''

    # Work with positions rather than sentence strings so that duplicate
    # sentences keep their own score and their own place in the text.
    weights = [get_sentence_weight(sen, keywords) for sen in sentence_list]
    ranked = sorted(
        range(len(sentence_list)),
        key=lambda position: weights[position],
        reverse=True,
    )

    # Short texts would otherwise round down to zero sentences.
    keep = max(1, int(len(sentence_list) * 0.2))
    chosen = sorted(ranked[:keep])  # restore order of occurrence

    return '. '.join(sentence_list[position] for position in chosen)