File size: 969 Bytes
e2b1d98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import re
from nltk.tokenize import RegexpTokenizer
import spacy

def remove_patterns(text):
    """
    Strip newlines, e-mail-like tokens, hashtags and punctuation from text.

    Args:
        text: A ``str``, or any object that exposes its raw string through
            a ``.text`` attribute (spaCy ``Span``, ``Doc`` and ``Token``
            objects all do).

    Returns:
        The cleaned string. Whitespace left behind by removed items is not
        collapsed, so runs of consecutive spaces may remain.
    """
    # Duck-type on ``.text`` instead of checking for spaCy's Span only, so
    # Doc/Token objects are accepted too; plain strings pass through as-is.
    if not isinstance(text, str):
        text = getattr(text, "text", text)

    # Newlines become spaces so the words they separated are not glued together.
    text = re.sub(r'\n', ' ', text)
    # Drop every whitespace-delimited token containing '@' (plus one trailing
    # whitespace char). Must run before punctuation stripping, otherwise
    # '@' and '.' would vanish first and the address would survive as words.
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Hashtags likewise have to go while the leading '#' is still present.
    text = re.sub(r'#\w+', '', text)
    # Finally remove anything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', '', text)

    return text

def extract_patterns(text):
    """
    Collect the punctuation marks, e-mail-like tokens and hashtags in text.

    Args:
        text: The string to scan.

    Returns:
        A ``(punctuation, emails, hashtags)`` tuple of lists, each holding
        the matches in the order they occur in text.
    """
    # Hashtags: a '#' immediately followed by one or more word characters.
    found_hashtags = re.findall(r'#\w+', text)
    # E-mails: non-whitespace runs on both sides of an '@'.
    found_emails = re.findall(r'\S+@\S+', text)
    # Punctuation: every single char that is neither word char nor whitespace.
    found_punctuation = re.findall(r'[^\w\s]', text)

    return found_punctuation, found_emails, found_hashtags

def remove_punct_nltk(text):
    """
    Remove punctuation from text using NLTK's ``RegexpTokenizer``.

    Args:
        text: The string to clean.

    Returns:
        The word-character tokens of text joined by single spaces.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize() returns a new list of tokens and leaves ``text`` untouched,
    # so the result has to be used; returning ``text`` itself was a no-op.
    return ' '.join(tokenizer.tokenize(text))