Spaces:
Runtime error
Runtime error
File size: 969 Bytes
e2b1d98 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import re
from nltk.tokenize import RegexpTokenizer
import spacy
def remove_patterns(text):
    """
    Remove punctuation, e-mail addresses and hashtags from the given text.

    Args:
        text: A string, or any object that exposes its string content through
            a ``.text`` attribute (for example a spaCy ``Span``).

    Returns:
        The cleaned string. Each newline is replaced by a space; whitespace
        left behind by removed hashtags or punctuation is not collapsed.
    """
    # Duck-typed unwrap: anything carrying a `.text` attribute is reduced to
    # that string. `str` has no `.text`, so plain strings pass through as-is.
    text = getattr(text, 'text', text)
    # Replace each newline with a space so adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    # Remove emails: any whitespace-delimited token containing '@', together
    # with at most one trailing whitespace character
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove hashtags. This must run before punctuation removal, which would
    # otherwise strip only the '#' and leave the tag word behind.
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation: everything that is neither a word char nor whitespace
    text = re.sub(r'[^\w\s]', '', text)
    return text
def extract_patterns(text):
    """
    Extract punctuation, e-mail addresses and hashtags from the given text.

    Args:
        text: A string, or any object that exposes its string content through
            a ``.text`` attribute (for example a spaCy ``Span``).

    Returns:
        A tuple ``(punctuation, emails, hashtags)`` of lists of strings, each
        in order of appearance. All three searches run on the same unmodified
        text, so the punctuation list also contains the '@', '.' and '#'
        characters that belong to e-mail addresses and hashtags.
    """
    # Duck-typed unwrap: anything carrying a `.text` attribute is reduced to
    # that string. `str` has no `.text`, so plain strings pass through as-is.
    text = getattr(text, 'text', text)
    # Extract emails: runs of non-whitespace with an '@' strictly inside
    emails = re.findall(r'\S+@\S+', text)
    # Extract hashtags: '#' followed by one or more word characters
    hashtags = re.findall(r'#\w+', text)
    # Extract punctuation: every char that is neither a word char nor whitespace
    punctuation = re.findall(r'[^\w\s]', text)
    return punctuation, emails, hashtags
def remove_punct_nltk(text):
    """
    Remove punctuation from the given text using NLTK's RegexpTokenizer.

    Args:
        text: The string to clean.

    Returns:
        A string made of the word tokens of ``text`` (maximal runs of ``\\w``
        characters) joined by single spaces. Punctuation is dropped and any
        run of whitespace between words becomes a single space.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize() returns a new list and does not modify `text`; the result
    # has to be used, otherwise the input comes back unchanged.
    tokens = tokenizer.tokenize(text)
    return ' '.join(tokens)