Ulpan committed on
Commit
7a96f34
1 Parent(s): 204bf55

Upload 10 files

Browse files
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
+ #from src.summary import summarize
4
+ #from src.bert2 import get_summary
5
+ #from src.mbart import predictions
6
+ from src.bert3 import predictions
7
+
8
+
9
if __name__ == '__main__':

    # Page heading plus a short description of what the app does.
    st.header("Text Summarization using BERT")
    st.subheader("This app will summarize the long piece of input text in a few sentences")

    # Input area for the text to be summarized.
    st.subheader("Paste your long text below:")
    text = st.text_area(label="Input text")

    if st.button("Summarize"):
        # Only run the model when the user actually entered something.
        if not text:
            st.error("Please paste or write(!) some text")
        else:
            st.success(predictions(text))
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ streamlit
+ pandas
+ transformers
+ torch
src/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (149 Bytes). View file
 
src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (147 Bytes). View file
 
src/__pycache__/bert3.cpython-38.pyc ADDED
Binary file (1.1 kB). View file
 
src/__pycache__/summary.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
src/__pycache__/summary.cpython-38.pyc ADDED
Binary file (2.18 kB). View file
 
src/bert3.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import razdel
2
+ #import torch
3
+ #from datasets import load_dataset
4
+ import pandas as pd
5
+ import numpy as np
6
+ #import gensim
7
+ #from tqdm.auto import tqdm
8
+ from transformers import AutoTokenizer, EncoderDecoderModel
9
+
10
# Pretrained Russian headline-generation model from the Hugging Face hub.
model_name = "IlyaGusev/rubert_telegram_headlines"
# NOTE(review): tokenizer options (keep case, no basic tokenization, keep
# accents) presumably match the model's training setup — confirm against the
# model card before changing them.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False, do_basic_tokenize=False, strip_accents=False)
# Encoder-decoder model used for sequence-to-sequence generation.
model = EncoderDecoderModel.from_pretrained(model_name)
13
+
14
def get_summary(article_text):
    """Generate a short headline/summary for ``article_text`` using the
    encoder-decoder model and tokenizer loaded at module level."""
    # Tokenize a single-item batch to a fixed-length (256) padded tensor.
    encoded = tokenizer(
        [article_text],
        add_special_tokens=True,
        max_length=256,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"]

    # Beam search (10 beams), at most 64 output tokens, no repeated trigrams.
    # NOTE(review): top_p normally only applies when sampling; with plain beam
    # search it is presumably ignored — confirm against the transformers
    # generate() documentation.
    generated = model.generate(
        input_ids=input_ids,
        max_length=64,
        no_repeat_ngram_size=3,
        num_beams=10,
        top_p=0.95
    )
    output_ids = generated[0]

    # Decode the generated token ids back into clean text.
    headline = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return headline
34
+
35
def predictions(text):
    """Thin wrapper around get_summary so app.py has a uniform entry point."""
    return get_summary(text)
src/summary.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def get_sentences(txt):
    """Naively split ``txt`` into sentences on the period character."""
    return txt.split('.')
5
+
6
def get_words(txt):
    """Strip everything except alphanumerics and whitespace from ``txt``,
    then split on single spaces.

    Bug fix: the original character class ``[^0-9^a-z^A-Z\\s]`` treated the
    inner ``^`` characters as literals, so caret characters survived the
    cleanup and polluted the word list.
    """
    only_words_text = re.compile(r'[^0-9a-zA-Z\s]').sub('', txt)
    return only_words_text.split(' ')
9
+
10
def get_keywords(word_list, min_ratio=0.001, max_ratio=0.5):
    """Return the set of words whose relative frequency in ``word_list``
    lies within ``[min_ratio, max_ratio]``.

    Robustness fixes: validate the ratios with an explicit exception instead
    of ``assert`` (asserts are stripped under ``-O``), and return an empty
    set for an empty word list instead of dividing by zero.
    """
    if not (min_ratio < 1 and max_ratio < 1):
        raise ValueError("min_ratio and max_ratio must be < 1")
    if not word_list:
        return set()
    total = len(word_list)
    # Count occurrences of each word.
    count_dict = {}
    for word in word_list:
        count_dict[word] = count_dict.get(word, 0) + 1
    # Keep words whose frequency ratio falls inside the accepted band.
    return {
        word
        for word, cnt in count_dict.items()
        if min_ratio <= cnt * 1.0 / total <= max_ratio
    }
23
+
24
def get_sentence_weight(sentence, keywords):
    """Score ``sentence`` against ``keywords``.

    The weight is (number of keyword occurrences)^2 divided by the size of
    the word window spanning the first and last keyword; 0 when the sentence
    contains no keyword.

    Bug fix: the backwards scan previously used ``range(len - 1, 0, -1)`` and
    so never inspected index 0; a sentence whose only keyword was its first
    word incorrectly scored 0.
    """
    sen_list = sentence.split(' ')
    window_start = 0
    window_end = -1
    # Find the first keyword position (window start).
    for i in range(len(sen_list)):
        if sen_list[i] in keywords:
            window_start = i
            break
    # Find the last keyword position (window end); must include index 0.
    for i in range(len(sen_list) - 1, -1, -1):
        if sen_list[i] in keywords:
            window_end = i
            break
    # No keyword found at all: window_end stayed -1.
    if window_start > window_end:
        return 0
    window_size = window_end - window_start + 1
    # Count keyword occurrences in the whole sentence.
    keywords_cnt = 0
    for w in sen_list:
        if w in keywords:
            keywords_cnt += 1
    return keywords_cnt * keywords_cnt * 1.0 / window_size
47
+
48
def summarize(text):
    """Extractive summary: keep roughly the top 20% highest-weighted
    sentences and re-join them in their original order."""
    flat = text.replace('\n', '')
    keywords = get_keywords(get_words(flat), 0.05, 0.5)
    sentences = get_sentences(flat)

    # Weight every sentence; duplicate sentences collapse onto one entry.
    weights = {}
    for sen in sentences:
        weights[sen] = get_sentence_weight(sen, keywords)

    # Rank by weight (stable sort), keep the top fifth — note the cut-off is
    # based on the number of DISTINCT sentences, matching the dict size.
    ranked = sorted(sentences, key=lambda s: weights[s], reverse=True)
    kept = ranked[:int(len(weights) * 0.2)]

    # Restore original document order before joining.
    kept.sort(key=lambda s: sentences.index(s))
    return '. '.join(kept)