theekshana committed
Commit: c293aab
Parent(s): a4f8505
pegasus

Files changed:
- app.py (+159, -235)
- config.py (+4, -3)
- summarizer.py (+54, -53)
app.py
CHANGED
Old version of app.py (lines removed by this commit are prefixed with "-"):

@@ -1,9 +1,11 @@
import datetime
import logging
import nltk
import validators
import streamlit as st
-from summarizer import Summarizer
from config import MODELS
from warnings import filterwarnings
@@ -15,91 +17,141 @@ from utils import (
    read_text_from_file,
)

-from rouge import Rouge
-
-# def filer():
-#     # return "logs/log "
-#     today = datetime.datetime.today()
-#     log_filename = f"logs/{today.year}-{today.month:02d}-{today.day:02d}.log"
-#     return log_filename
-
-# file_handler = logging.FileHandler(filer())
-# # file_handler = logging.handlers.TimedRotatingFileHandler(filer(),when="D")
-# file_handler.setLevel(logging.INFO)
-
-# logging.basicConfig(
-#     level=logging.DEBUG,
-#     format="%(asctime)s %(levelname)s (%(name)s) : %(message)s",
-#     datefmt="%Y-%m-%d %H:%M:%S",
-#     handlers=[file_handler],
-#     force=True,
-# )

logger = logging.getLogger(__name__)

-
-if "api_key" not in st.session_state:
-    st.session_state.api_key = " "
-
-
-@st.cache_resource
def initialize_app():
    nltk.download("punkt")

-
-    [removed lines of the old init_summarizer not legible in this view]
-    if model_name == "OpenAI":
-        model_type = "openai"
-
-    [removed lines not legible in this view]
-        return Summarizer(model_path,model_type,api_key)
-    else:
-        logger.info(f"Model for summarization : {model_path}")
-        return Summarizer(model_path, model_type)

def load_app():
    st.title("Text Summarizer 📝")

-    # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
-    # st.markdown(
-    #     "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
-    # )
-    model_name = st.sidebar.selectbox(
-        "Model Name", options=["Version 0", "Version 1","OpenAI"]
-    )
-    if model_name == "OpenAI":
-        st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
-
-    summarizer_type = st.sidebar.selectbox(
-        "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
-    )
-
-    st.markdown(
-        "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
-    )
-    st.markdown(
-        """- Raw text in text box
-    - URL of article/news to be summarized
-    - .txt, .pdf, .docx file formats"""
-    )
-    st.markdown(
-        """This app supports abstractive summarization of documents:
-
-    **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
-    )
-    st.markdown("---")
-    # ---------------------------
-
-    # ---------------------------
    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
-        "<h3 style='text-align: center; color: green;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
@@ -125,51 +177,44 @@ def load_app():
            st.write(cleaned_txt[0])
        else:
            st.write(cleaned_txt)
-    summarize = st.button("Summarize")
-
-    if is_url:
-        text_to_summarize = " ".join([txt for txt in cleaned_txt])
-    else:
-        text_to_summarize = cleaned_txt
-
-    return text_to_summarize, model_name, summarizer_type, summarize
-
-
-
-
-def get_summary(text_to_summarize,model_name, summarizer_type, summarize):
-
-    while not summarize:
-        continue
-
-    else:
-
-        logger.info(f"Summarization Type for Long Text: {summarizer_type}")
-
-        [removed lines not legible in this view]
-                return summarized_text, time
-            else :
-                summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
-                return summarized_text, time

-

def display_output(summarized_text,time):
-
-
    logger.info(f"SUMMARY: {summarized_text}")
    logger.info(f"Summary took {time}s")
    st.subheader("Summarized text")
@@ -177,140 +222,19 @@ def display_output(summarized_text,time):
    st.info(f"Time: {time}s")


-
-
-
-
-
-# # st.markdown("Creator: [Atharva Ingle](https://github.com/Gladiator07)")
-# # st.markdown(
-# #     "Source code: [GitHub Repository](https://github.com/Gladiator07/Text-Summarizer)"
-# # )
-# model_name = st.sidebar.selectbox(
-#     "Model Name", options=["Version 0", "Version 1","OpenAI"]
-# )
-# if model_name == "OpenAI":
-#     st.sidebar.text_input("Enter a valid OpenAI API Key",key = "api_key" ,type="password")
-
-# summarizer_type = st.sidebar.selectbox(
-#     "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
-# )
-
-# st.markdown(
-#     "Enter a text or a url to get a concise summary of the article while conserving the overall meaning. This app supports text in the following formats:"
-# )
-# st.markdown(
-#     """- Raw text in text box
-# - URL of article/news to be summarized
-# - .txt, .pdf, .docx file formats"""
-# )
-# st.markdown(
-#     """This app supports two type of summarization:
-
-# 1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
-# 2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
-# )
-# st.markdown("---")
-# # ---------------------------
-# # SETUP & Constants
-# # nltk.download("punkt")
-# # abs_tokenizer_name = "facebook/bart-large-cnn"
-# # abs_model_name = "facebook/bart-large-cnn"
-# # abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
-# # abs_max_length = 90
-# # abs_min_length = 30
-
-# # model_name_v0 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0"
-# # model_name_v1 = "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1"
-# # ---------------------------
-# inp_text = st.text_input("Enter text or a url here")
-# st.markdown(
-#     "<h3 style='text-align: center; color: green;'>OR</h3>",
-#     unsafe_allow_html=True,
-# )
-# uploaded_file = st.file_uploader(
-#     "Upload a .txt, .pdf, .docx file for summarization"
-# )
-
-# is_url = validators.url(inp_text)
-# if is_url:
-#     # complete text, chunks to summarize (list of sentences for long docs)
-#     logger.info("Text Input Type: URL")
-#     text, cleaned_txt = fetch_article_text(url=inp_text)
-# elif uploaded_file:
-#     logger.info("Text Input Type: FILE")
-#     cleaned_txt = read_text_from_file(uploaded_file)
-#     cleaned_txt = clean_text(cleaned_txt)
-# else:
-#     logger.info("Text Input Type: INPUT TEXT")
-#     cleaned_txt = clean_text(inp_text)
-
-# # view summarized text (expander)
-# with st.expander("View input text"):
-#     if is_url:
-#         st.write(cleaned_txt[0])
-#     else:
-#         st.write(cleaned_txt)
-# summarize = st.button("Summarize")
-
-# # called on toggle button [summarize]
-# if summarize:
-#     if is_url:
-#         text_to_summarize = " ".join([txt for txt in cleaned_txt])
-#     else:
-#         text_to_summarize = cleaned_txt
-
-#     logger.info(f"Model Name: {model_name}")
-#     logger.info(f"Summarization Type for Long Text: {summarizer_type}")
-
-#     api_key = st.session_state.api_key
-
-#     print(api_key)
-
-#     summarizer = init_summarizer(model_name,api_key)
-
-#     with st.spinner(
-#         text="Creating summary. This might take a few seconds ..."
-#     ):
-#         #ext_model = Summarizer()
-#         #summarized_text = ext_model(text_to_summarize, num_sentences=5)
-
-#         if summarizer_type == "Refine":
-#             summarized_text, time = summarizer.summarize(text_to_summarize,"refine")
-#         else :
-#             summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
-
-#     # elif model_name == "Version 1":
-#     #     with st.spinner(
-#     #         text="Creating summary. This might take a few seconds ..."
-#     #     ):
-#     #         if summarizer_type == "Refine":
-#     #             summarized_text, time = summarizer_v1.summarize(text_to_summarize,"refine")
-#     #         else :
-#     #             summarized_text, time = summarizer_v1.summarize(text_to_summarize,"map_reduce")
-
-#     # final summarized output
-
-#     logger.info(f"SUMMARY: {summarized_text}")
-#     logger.info(f"Summary took {time}s")
-#     st.subheader("Summarized text")
-#     st.info(f"{summarized_text}")
-#     st.info(f"Time: {time}s")
-
-#     # st.subheader("Rogue Scores")
-#     # rouge_sc = Rouge()
-#     # ground_truth = cleaned_txt[0] if is_url else cleaned_txt
-#     # score = rouge_sc.get_scores(summarized_text, ground_truth, avg=True)
-#     # st.code(score)


if __name__ == "__main__":
-
-    text_to_summarize, model_name, summarizer_type, summarize = load_app()
-    summarized_text,time = get_summary(text_to_summarize, model_name, summarizer_type, summarize)
-    display_output(summarized_text,time)


New version of app.py (lines added by this commit are prefixed with "+"; "..." marks unchanged code the diff does not show):

import datetime
+import os
+import time
import logging
import nltk
import validators
import streamlit as st
+from summarizer import summarizer_init, summarizer_summarize
from config import MODELS
from warnings import filterwarnings
...
    read_text_from_file,
)

+# summarizer = None
+# from rouge import Rouge

logger = logging.getLogger(__name__)

def initialize_app():
    nltk.download("punkt")
+    SESSION_DEFAULTS = {
+        "model_type": "local",
+        "model_name": "long-t5 v1",
+        "summarizer_type": "Map Reduce",
+        "is_parameters_changed": False,
+        # "user_question": '',
+        'openai_api_key': '',
+    }

+    for k, v in SESSION_DEFAULTS.items():
+        if k not in st.session_state:
+            st.session_state[k] = v

+    # init_summarizer(st.session_state.model_name, api_key=None)


+@st.cache_resource
+def init_summarizer(model_name, api_key=None):
+    with st.spinner(
+        text="initialising the summarizer. This might take a few seconds ..."
+    ):
+        model_type = "local"
+        if model_name == "OpenAI":
+            model_type = "openai"
+
+        model_path = MODELS[model_name]
+        if model_type == "openai":
+            # validation logic
+            api_key = st.session_state.openai_api_key
+            tokenizer, base_summarizer = summarizer_init(model_path, model_type, api_key)
+        else:
+            logger.info(f"Model for summarization : {model_path}")
+            tokenizer, base_summarizer = summarizer_init(model_path, model_type)
+
+        alert = st.success("summarizer initialised")
+        time.sleep(1)  # Wait for 1 second
+        alert.empty()  # Clear the alert
+        return model_type, tokenizer, base_summarizer
+
+def update_parameters_change():
+    st.session_state.is_parameters_changed = True
+
+
+def parameters_change_button(model_name, summarizer_type):
+    st.session_state.model_name = model_name
+    st.session_state.summarizer_type = summarizer_type
+    st.session_state.is_parameters_changed = False
+    # init_summarizer(model_name, api_key=None)
+    alert = st.success("chat parameters updated")
+    time.sleep(2)  # Wait for 2 seconds
+    alert.empty()  # Clear the alert
+
+import re
+def is_valid_open_ai_api_key(secretKey):
+    if re.search("^sk-[a-zA-Z0-9]{32,}$", secretKey):
+        return True
+    else: return False
+
+def side_bar():
+    with st.sidebar:
+        st.subheader("Model parameters")
+
+        with st.form('param_form'):
+            # st.info('Info: use openai chat model for best results')
+            model_name = st.selectbox(
+                "Chat model",
+                MODELS,
+                # options=["long-t5 v0", "long-t5 v1", "pegasus-x-large v1", "OpenAI"],
+                key="Model Name",
+                help="Select the LLM model for summarization",
+                # on_change=update_parameters_change,
+            )
+
+            summarizer_type = st.selectbox(
+                "Summarizer Type for Long Text", options=["Map Reduce", "Refine"]
+            )
+
+            submitted = st.form_submit_button(
+                "Save Parameters",
+                # on_click=update_parameters_change
+            )
+
+            if submitted:
+                parameters_change_button(model_name, summarizer_type)
+
+
+        st.markdown("\n")
+        if st.session_state.model_name == 'openai':
+            with st.form('openai api key'):
+                api_key = st.text_input(
+                    "Enter openai api key",
+                    type="password",
+                    value=st.session_state.openai_api_key,
+                    help="enter an openai api key created from 'https://platform.openai.com/account/api-keys'",
+                )
+
+                submit_key = st.form_submit_button(
+                    "Save key",
+                    # on_click=update_parameters_change
+                )
+
+                if submit_key:
+                    st.session_state.openai_api_key = api_key
+                    # st.text(st.session_state.openai_api_key)
+                    alert = st.success("openai api key updated")
+                    time.sleep(1)  # Wait for 1 second
+                    alert.empty()  # Clear the alert
+        st.markdown(
+            "### How to use\n"
+            "1. Select the LLM model\n"  # noqa: E501
+            "1. If selected model asks for a api key enter a valid api key.\n"  # noqa: E501
+            "1. Enter a text or a url to get a summary."
+        )
+        st.markdown("---")
+        st.markdown("""
+        This app supports text in the following formats:
+        - Raw text in text box
+        - URL of article/news to be summarized
+        - .txt, .pdf, .docx file formats
+        """)


def load_app():
    st.title("Text Summarizer 📝")

    inp_text = st.text_input("Enter text or a url here")
    st.markdown(
+        "<h4 style='text-align: center; color: green;'>OR</h4>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
...
            st.write(cleaned_txt[0])
        else:
            st.write(cleaned_txt)

+    submitted = st.button("Summarize")

+    if submitted:
+        if is_url:
+            text_to_summarize = " ".join([txt for txt in cleaned_txt])
+        else:
+            text_to_summarize = cleaned_txt

+        submit_text_to_summarize(text_to_summarize)

+def submit_text_to_summarize(text_to_summarize):
+    summarized_text, time = get_summary(text_to_summarize)
+    display_output(summarized_text, time)


+def get_summary(text_to_summarize):
+    model_name = st.session_state.model_name
+    summarizer_type = st.session_state.summarizer_type
+    model_type, tokenizer, base_summarizer = init_summarizer(model_name, api_key=None)

+    logger.info(f"Model Name: {model_name}")
+    logger.info(f"Summarization Type for Long Text: {summarizer_type}")

+    with st.spinner(
+        text="Creating summary. This might take a few seconds ..."
+    ):
+        if summarizer_type == "Refine":
+            # summarized_text, time = summarizer.summarize(text_to_summarize,"refine")
+            summarized_text, time = summarizer_summarize(model_type, tokenizer, base_summarizer, text_to_summarize, summarizer_type="refine")
+            return summarized_text, time
+        else:
+            # summarized_text, time = summarizer.summarize(text_to_summarize,"map_reduce")
+            summarized_text, time = summarizer_summarize(model_type, tokenizer, base_summarizer, text_to_summarize, summarizer_type="map_reduce")
+            return summarized_text, time


def display_output(summarized_text,time):
    logger.info(f"SUMMARY: {summarized_text}")
    logger.info(f"Summary took {time}s")
    st.subheader("Summarized text")
...
    st.info(f"Time: {time}s")


+def main():
+
+    initialize_app()
+    side_bar()
+    load_app()
+    # chat_body()


if __name__ == "__main__":
+    main()
+    # text_to_summarize, model_name, summarizer_type, summarize = load_app()
+    # summarized_text,time = get_summary(text_to_summarize, model_name, summarizer_type, summarize)
+    # display_output(summarized_text,time)
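The rewritten app.py leans on two Streamlit idioms: initialize_app() seeds st.session_state from a dictionary of defaults so values survive reruns, and init_summarizer() is wrapped in @st.cache_resource so the expensive model load happens only once per distinct argument set. A minimal, self-contained sketch of that pattern (the load_summarizer stand-in below is illustrative only, not the commit's init_summarizer):

    import streamlit as st

    SESSION_DEFAULTS = {"model_name": "long-t5 v1", "summarizer_type": "Map Reduce"}

    for key, value in SESSION_DEFAULTS.items():
        if key not in st.session_state:      # only set on the first run of the session
            st.session_state[key] = value

    @st.cache_resource                       # later reruns reuse the cached object
    def load_summarizer(model_name: str):
        # stand-in for the commit's init_summarizer(); anything slow to build goes here
        return f"summarizer for {model_name}"

    summarizer = load_summarizer(st.session_state.model_name)
    st.write(summarizer)

Because st.cache_resource keys its cache on the function arguments, picking a different model in the sidebar builds a new summarizer once and then serves it from cache on subsequent reruns.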
config.py
CHANGED
@@ -1,5 +1,6 @@
Old version:
MODELS = {
-    [three model entries, truncated in this view]
}

New version:
MODELS = {
+    "long-t5 v0": "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v0",
+    "long-t5 v1": "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1",
+    "pegasus-x-large v1": "IronOne-AI-Labs/pegasus-x-large-annual-report-QLoRA-fine-tuned-v1.1",  # for tokenizer
+    "openai": "IronOne-AI-Labs/pegasus-x-large-annual-report-QLoRA-fine-tuned-v1.1"
}
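config.py now maps human-readable model names to Hugging Face repo ids. The sidebar passes the whole MODELS dict to st.selectbox, so the dict keys become the selectable options, and the chosen key is later resolved to a repo id before calling summarizer_init. A small illustrative sketch (not part of the commit):

    # Illustrative only: resolving a selected display name to its Hugging Face repo id.
    MODELS = {
        "long-t5 v1": "IronOne-AI-Labs/long-t5-tglobal-16k-annual-reports-v1",
        "openai": "IronOne-AI-Labs/pegasus-x-large-annual-report-QLoRA-fine-tuned-v1.1",
    }

    model_name = "long-t5 v1"          # e.g. the value st.selectbox("Chat model", MODELS) returns
    model_path = MODELS[model_name]    # repo id handed to summarizer_init()
    print(model_path)

Note that the "openai" entry reuses the pegasus-x repo id; judging by the "# for tokenizer" comment, this appears to be so a local tokenizer is still available for token counting when the OpenAI backend is selected.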
summarizer.py
CHANGED
@@ -8,65 +8,66 @@ from logging import getLogger
Old version of summarizer.py (removed lines prefixed with "-"):

import time

logger = getLogger(__name__)
-class Summarizer:


-    def __init__(self,model_name,model_type,api_key=None) -> None:
-        self.model_type = model_type
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.base_summarizer = get_model(model_type,model_name,api_key)

-        [remaining removed lines of the old Summarizer class are not legible in this view]
-
-        llm_chain = get_map_reduce_chain(self.base_summarizer,model_type=self.model_type)
-        logger.info("Running Map Reduce Chain for Summarization")
-        start = time.time()
-        summary = llm_chain.invoke({"input_documents": text_to_summarize}, return_only_outputs=True)['output_text']
-        end = time.time()
-        print(f"Summary generation took {round((end-start),2)}s.")
-        return summary,round((end-start),2)


New version of summarizer.py (added lines prefixed with "+"):

import time

logger = getLogger(__name__)


+def summarizer_init(model_name, model_type, api_key=None):
+    # model_type = model_type
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    base_summarizer = get_model(model_type, model_name, api_key)
+    return tokenizer, base_summarizer

+def summarizer_summarize(model_type, tokenizer, base_summarizer, text: str, summarizer_type="map_reduce") -> str:

+    text_to_summarize, length_type = prepare_for_summarize(text, tokenizer)

+    if length_type == "short":
+
+        logger.info("Processing Input Text less than 12000 Tokens")
+        if model_type == "openai":
+            llm = base_summarizer
+            prompt = PromptTemplate.from_template(
+                template="""Write a concise and complete summary in bullet points of the given annual report.
+Important:
+* Note that the summary should contain all important information and it should not contain any unwanted information.
+* Make sure to keep the summary as short as possible. And Summary should be in bullet points. Seperate each point with a new line.
+TEXT: {text}
+SUMMARY:"""
+            )
+            llm_chain = prompt | llm
+            start = time.time()
+            summary = llm_chain.invoke({"text": text_to_summarize})
+            end = time.time()
+            print(f"Summary generation took {round((end-start),2)}s.")
+            return summary, round((end-start), 2)
+
+        elif model_type == "local":
+            pipe = base_summarizer
+            start = time.time()
+            summary = pipe(text_to_summarize)[0]['summary_text']
+            end = time.time()
+            print(f"Summary generation took {round((end-start),2)}s.")
+            return summary, round((end-start), 2)
+    else:
+        if summarizer_type == "refine":
+            print("The text is too long, Running Refine Summarizer")
+            llm_chain = get_refine_chain(base_summarizer, model_type)
+            logger.info("Running Refine Chain for Summarization")
+            start = time.time()
+            summary = llm_chain.invoke({"input_documents": text_to_summarize}, return_only_outputs=True)['output_text']
+            end = time.time()
+            print(f"Summary generation took {round((end-start),2)}s.")
+            return summary, round((end-start), 2)


+        else:
+            print("The text is too long, Running Map Reduce Summarizer")

+            llm_chain = get_map_reduce_chain(base_summarizer, model_type=model_type)
+            logger.info("Running Map Reduce Chain for Summarization")
+            start = time.time()
+            summary = llm_chain.invoke({"input_documents": text_to_summarize}, return_only_outputs=True)['output_text']
+            end = time.time()
+            print(f"Summary generation took {round((end-start),2)}s.")
+            return summary, round((end-start), 2)
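With this change the Summarizer class is replaced by two module-level functions, so the tokenizer/model pair can be built once (and cached by the Streamlit layer) and then passed explicitly into every summarization call. A hedged usage sketch, assuming this repo's summarizer.py and config.py are importable and that the helpers they rely on (get_model, prepare_for_summarize, and the LangChain chain builders) are available; the model is downloaded from the Hugging Face Hub on first use:

    from config import MODELS
    from summarizer import summarizer_init, summarizer_summarize

    model_path = MODELS["long-t5 v1"]                  # Hugging Face repo id
    tokenizer, base_summarizer = summarizer_init(model_path, model_type="local")

    report_text = "Full text of the annual report to be summarized ..."
    summary, seconds = summarizer_summarize(
        "local", tokenizer, base_summarizer, report_text, summarizer_type="map_reduce"
    )
    print(summary, seconds)

Inputs that prepare_for_summarize classifies as "short" are summarized in a single pass (local pipeline or OpenAI prompt); longer inputs are routed through the map-reduce or refine chain selected by summarizer_type.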