piecurus committed on
Commit
5c492e2
1 Parent(s): 41519cf
Files changed (1) hide show
  1. utils.py +137 -0
utils.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ import docx2txt
4
+ from io import StringIO
5
+ from PyPDF2 import PdfFileReader
6
+
7
+ from bs4 import BeautifulSoup
8
+ from nltk.tokenize import sent_tokenize
9
+
10
# Character-class ranges removed by ``emoji_pattern``. Each entry is a
# ``start-end`` range that is concatenated inside a single ``[...]+`` class.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    "\U000024C2-\U0001F251",
)

# Matches one or more consecutive code points drawn from the ranges above.
# NOTE(review): the last range runs from U+24C2 to U+1F251 and therefore
# matches every code point in between (e.g. U+4E2D), not only emoji --
# confirm this breadth is intended before reusing the pattern elsewhere.
emoji_pattern = re.compile(
    "[" + "".join(_EMOJI_RANGES) + "]+",
    flags=re.UNICODE,
)
21
+
22
+
23
def clean_text(x):
    """Return *x* with URLs, mentions, hashtags and special characters removed.

    The steps run in a fixed order:

    1. Drop every non-ASCII character.
    2. Replace ``http``/``https`` tokens, ``@mentions`` and ``#hashtags``
       with a single space, then collapse runs of whitespace.
    3. Apply ``emoji_pattern`` (a no-op on text that is already ASCII-only,
       kept so the pipeline stays intact if step 1 is ever relaxed).
    4. Replace every run of characters other than ``.,!?``, ASCII letters
       and digits with a single space.

    Letter case and digits are preserved. The result may start or end with
    a space.
    """
    # Non-ASCII characters are discarded, not transliterated.
    cleaned = x.encode("ascii", "ignore").decode()

    token_rules = (
        (r"https*\S+", " "),  # URLs
        (r"@\S+", " "),  # mentions
        (r"#\S+", " "),  # hashtags
        (r"\s{2,}", " "),  # runs of whitespace
    )
    for pattern, replacement in token_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = emoji_pattern.sub(r"", cleaned)  # emojis

    # Special characters except . , ! ?
    return re.sub("[^.,!?A-Za-z0-9]+", " ", cleaned)
37
+
38
+
39
def fetch_article_text(url: str, *, timeout: float = 30.0):
    """Download a web page and split its headline/paragraph text into chunks.

    The text of every ``<h1>`` and ``<p>`` element is joined with spaces,
    an ``<eos>`` marker is inserted after each ``.``, ``!`` and ``?``, and
    the resulting sentences are greedily packed into chunks of at most 500
    space-separated words.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled server would block the caller
            indefinitely.

    Returns:
        A ``(article, chunks)`` tuple. ``article`` is the joined page text
        *including* the inserted ``<eos>`` markers; ``chunks`` is a list of
        strings, each built from whole sentences.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    max_words_per_chunk = 500

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Mark sentence boundaries so the text can be split without losing the
    # terminating punctuation.
    for mark in (".", "!", "?"):
        article = article.replace(mark, mark + "<eos>")

    chunks = []
    for sentence in article.split("<eos>"):
        words = sentence.split(" ")
        # Extend the current chunk while the sentence still fits; otherwise
        # (or for the very first sentence) start a new chunk.
        if chunks and len(chunks[-1]) + len(words) <= max_words_per_chunk:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    return article, [" ".join(words) for words in chunks]
67
+
68
+
69
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split *text* into sentence-aligned chunks that fit the tokenizer.

    Sentences (from ``sent_tokenize``) are packed greedily: a sentence is
    added to the current chunk while the running token count stays within
    ``tokenizer.max_len_single_sentence``; otherwise the current chunk is
    saved and the sentence starts a new one.

    Args:
        tokenizer: Object exposing ``tokenize(str)`` (whose result supports
            ``len``) and an integer ``max_len_single_sentence`` attribute.
        text: The text to split.

    Returns:
        A list of chunk strings in original order. Every sentence of *text*
        appears in exactly one chunk, and no chunk is empty. A single
        sentence longer than the limit is kept whole in its own chunk.
    """
    max_tokens = tokenizer.max_len_single_sentence

    chunks = []
    current_sentences = []
    current_length = 0

    for sentence in sent_tokenize(text):
        sentence_length = len(tokenizer.tokenize(sentence))

        # Flush only a non-empty chunk, so an oversized first sentence does
        # not produce an empty chunk.
        if current_sentences and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_length = 0

        current_sentences.append(sentence)
        current_length += sentence_length

    # Always save the trailing chunk, including the case where the final
    # sentence overflowed and started a chunk of its own.
    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks
103
+
104
+
105
def read_pdf(file):
    """Return the extracted text of every page of a PDF, concatenated.

    Args:
        file: Whatever ``PdfFileReader`` accepts as its source.

    Returns:
        The page texts joined in page order with no separator.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look
    # like the PyPDF2 1.x names -- confirm the pinned PyPDF2 version still
    # provides them.
    reader = PdfFileReader(file)
    page_total = reader.numPages
    return "".join(
        reader.getPage(index).extractText() for index in range(page_total)
    )
114
+
115
+
116
def read_text_from_file(file):
    """Return the text content of an uploaded file based on its MIME type.

    Supported types are plain text (decoded as UTF-8), PDF (via
    ``read_pdf``) and ``.docx`` (via ``docx2txt.process``).

    Args:
        file: Object with a ``type`` attribute holding the MIME type. For
            plain text it must also provide ``getvalue()`` returning bytes.
            (Presumably a Streamlit ``UploadedFile`` -- confirm with the
            caller.)

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``file.type`` is not one of the supported MIME types.
            Previously this case failed with ``UnboundLocalError``.
        UnicodeDecodeError: If a plain-text file is not valid UTF-8.
    """
    mime_type = file.type

    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return read_pdf(file)

    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    raise ValueError(f"Unsupported file type: {mime_type!r}")