Spaces:
Runtime error
Runtime error
add clean text func
Browse files- src/utils.py +29 -0
src/utils.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
# Matches runs of emoji / pictographic symbols.
#
# Every range is listed explicitly. A single catch-all range from U+24C2 to
# U+1F251 would also swallow CJK ideographs, kana, Hangul and most other
# non-Latin scripts that sit between those two code points, deleting real
# text instead of just emoji.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics, flags, enclosed ideographs
    "\U000024C2-\U000027BF"  # circled letters, box/geometric shapes, misc symbols, dingbats
    "\U00002934\U00002935"  # curved arrows
    "\U00002B05-\U00002B55"  # arrows, squares, star, circle
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji located in the CJK symbol blocks
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)
|
14 |
+
|
15 |
+
|
16 |
+
def clean_text(x: str) -> str:
    """Normalize free-form text to lowercase ASCII alphanumeric tokens.

    Steps, in order: lowercase; drop every non-ASCII character; blank out
    anything from "http" up to the next whitespace (URLs), ``@mentions``
    and ``#hashtags``; delete apostrophes so contractions stay one token
    ("don't" -> "dont"); replace each remaining run of non-alphanumeric
    characters with a single space; trim both ends.

    Args:
        x: Text to clean.

    Returns:
        The cleaned text: ``a-z0-9`` tokens separated by single spaces,
        with no leading or trailing whitespace. May be empty.
    """
    x = x.lower()
    # Dropping non-ASCII characters also removes every emoji, so no separate
    # emoji pass is needed afterwards.
    x = x.encode("ascii", "ignore").decode()
    x = re.sub(r"https*\S+", " ", x)  # URLs
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = x.replace("'", "")  # apostrophes: join contractions instead of splitting them
    # Runs of special characters become one space; this also collapses
    # repeated whitespace, since whitespace is non-alphanumeric.
    x = re.sub("[^A-Za-z0-9]+", " ", x)
    # The substitutions above can leave a space at either end.
    return x.strip()
|