Gladiator committed on
Commit
32ff21e
1 Parent(s): 097245e

fixed cleaning text bugs

Browse files
Files changed (2) hide show
  1. app.py +9 -2
  2. src/utils.py +3 -0
app.py CHANGED
@@ -28,11 +28,13 @@ if __name__ == "__main__":
28
  "Summarization type", options=["Extractive", "Abstractive"]
29
  )
30
  # ---------------------------
31
- # SETUP
32
  nltk.download("punkt")
33
  abs_tokenizer_name = "t5-base"
34
  abs_model_name = "t5-base"
35
  abs_tokenizer = T5Tokenizer.from_pretrained(abs_tokenizer_name)
 
 
36
  # ---------------------------
37
 
38
  inp_text = st.text_input("Enter text or a url here")
@@ -81,7 +83,12 @@ if __name__ == "__main__":
81
  tokenizer=abs_tokenizer, text=clean_txt
82
  )
83
  print(text_to_summarize)
84
- tmp_sum = abs_summarizer(text_to_summarize, do_sample=False)
 
 
 
 
 
85
 
86
  summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
87
 
 
28
  "Summarization type", options=["Extractive", "Abstractive"]
29
  )
30
  # ---------------------------
31
+ # SETUP & Constants
32
  nltk.download("punkt")
33
  abs_tokenizer_name = "t5-base"
34
  abs_model_name = "t5-base"
35
  abs_tokenizer = T5Tokenizer.from_pretrained(abs_tokenizer_name)
36
+ abs_max_length = 80
37
+ abs_min_length = 30
38
  # ---------------------------
39
 
40
  inp_text = st.text_input("Enter text or a url here")
 
83
  tokenizer=abs_tokenizer, text=clean_txt
84
  )
85
  print(text_to_summarize)
86
+ tmp_sum = abs_summarizer(
87
+ text_to_summarize,
88
+ max_length=abs_max_length,
89
+ min_length=abs_min_length,
90
+ do_sample=False,
91
+ )
92
 
93
  summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
94
 
src/utils.py CHANGED
@@ -38,6 +38,9 @@ def fetch_article_text(url: str):
38
  results = soup.find_all(["h1", "p"])
39
  text = [result.text for result in results]
40
  ARTICLE = " ".join(text)
 
 
 
41
  sentences = ARTICLE.split("<eos>")
42
  current_chunk = 0
43
  chunks = []
 
38
  results = soup.find_all(["h1", "p"])
39
  text = [result.text for result in results]
40
  ARTICLE = " ".join(text)
41
+ ARTICLE = ARTICLE.replace(".", ".<eos>")
42
+ ARTICLE = ARTICLE.replace("!", "!<eos>")
43
+ ARTICLE = ARTICLE.replace("?", "?<eos>")
44
  sentences = ARTICLE.split("<eos>")
45
  current_chunk = 0
46
  chunks = []