add url support for summarization
- app.py +18 -7
- src/utils.py +29 -0
app.py
CHANGED
@@ -1,13 +1,13 @@
 import torch
+import validators
 import streamlit as st
-from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration

 # local modules
 from extractive_summarizer.model_processors import Summarizer
-from src.utils import clean_text
+from src.utils import clean_text, fetch_article_text
 from src.abstractive_summarizer import abstractive_summarizer

-
 # abstractive summarizer model
 @st.cache()
 def load_abs_model():
@@ -25,9 +25,14 @@ if __name__ == "__main__":
         "Summarization type", options=["Extractive", "Abstractive"]
     )

-    inp_text = st.text_input("Enter
+    inp_text = st.text_input("Enter text or a url here")

-
+    is_url = validators.url(inp_text)
+    if is_url:
+        # complete text, chunks to summarize (list of sentences for long docs)
+        text, text_to_summarize = fetch_article_text(url=inp_text)
+    else:
+        text_to_summarize = clean_text(inp_text)

     # view summarized text (expander)
     with st.expander("View input text"):
@@ -44,7 +49,7 @@ if __name__ == "__main__":
             text="Creating extractive summary. This might take a few seconds ..."
         ):
             ext_model = Summarizer()
-            summarized_text = ext_model(
+            summarized_text = ext_model(text_to_summarize, num_sentences=6)

     elif summarize_type == "Abstractive":
         with st.spinner(
@@ -52,8 +57,14 @@
         ):
             abs_tokenizer, abs_model = load_abs_model()
             summarized_text = abstractive_summarizer(
-                abs_tokenizer, abs_model,
+                abs_tokenizer, abs_model, text_to_summarize
             )
+    elif summarize_type == "Abstractive" and is_url:
+        abs_url_summarizer = pipeline("summarization")
+        tmp_sum = abs_url_summarizer(
+            text_to_summarize, max_length=120, min_length=30, do_sample=False
+        )
+        summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])

     # final summarized output
     st.subheader("Summarized text")
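For orientation, the routing added above can be read on its own: validators.url returns a truthy result for a well-formed URL and a falsy failure object otherwise, so the diff can use it directly in an if. Below is a minimal sketch of that flow outside Streamlit; route_input and the example URL are hypothetical names used only for illustration. Note also that the added `elif summarize_type == "Abstractive" and is_url:` branch follows the plain `"Abstractive"` branch, and in an ordered elif chain such a later branch may never be reached, so it is worth double-checking that ordering.

import validators

from src.utils import clean_text, fetch_article_text


def route_input(inp_text: str):
    """Mirror the new app.py branch: URL input is fetched and chunked, plain text is cleaned."""
    if validators.url(inp_text):
        # article text plus pre-chunked pieces for long documents
        full_text, text_to_summarize = fetch_article_text(url=inp_text)
    else:
        full_text = inp_text
        text_to_summarize = clean_text(inp_text)
    return full_text, text_to_summarize


# example call (placeholder URL, for illustration only):
# full_text, chunks = route_input("https://example.com/article")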
src/utils.py
CHANGED
@@ -1,4 +1,6 @@
 import re
+import requests
+from bs4 import BeautifulSoup

 emoji_pattern = re.compile(
     "["
@@ -27,3 +29,30 @@ def clean_text(x):
     x = re.sub("[^A-Za-z0-9]+", " ", x) # special charachters

     return x
+
+
+def fetch_article_text(url: str):
+
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, "html.parser")
+    results = soup.find_all(["h1", "p"])
+    text = [result.text for result in results]
+    ARTICLE = " ".join(text)
+    sentences = ARTICLE.split("<eos>")
+    current_chunk = 0
+    chunks = []
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                chunks[current_chunk].extend(sentence.split(" "))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(" "))
+        else:
+            print(current_chunk)
+            chunks.append(sentence.split(" "))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+    return ARTICLE, chunks
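As a quick check of the new helper, this is how fetch_article_text is meant to be called (placeholder URL, for illustration only). One caveat worth verifying: the chunking step splits on the literal marker "<eos>", which downloaded article text will not normally contain unless sentence-end markers are inserted first, so as written most articles will come back as a single chunk.

from src.utils import fetch_article_text

# placeholder URL, for illustration only
full_text, chunks = fetch_article_text(url="https://example.com/article")

print(len(full_text.split()), "words fetched")
print(len(chunks), "chunk(s) of roughly 500 words or fewer")

# hypothetical pre-processing step that would make the "<eos>" split effective:
# article = article.replace(". ", ".<eos>").replace("! ", "!<eos>").replace("? ", "?<eos>")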