Spaces:

alpertml
/

TopicModelingForSummarization

Runtime error

File size: 969 Bytes

e2b1d98

import re
from nltk.tokenize import RegexpTokenizer
import spacy

def remove_patterns(text):
    """
        Remove line breaks, emails, hashtags and punctuation from the given text.

        Args:
            text: A string, or an object that exposes its raw string through
                a ``.text`` attribute (e.g. a spaCy ``Span``, ``Token`` or
                ``Doc``).

        Returns:
            The cleaned string. Whitespace left behind by removed tokens is
            not collapsed, so the result may contain repeated or trailing
            spaces.
    """
    # Duck-type on `.text` rather than checking for one concrete spaCy class:
    # this keeps Span inputs working, also accepts other text containers, and
    # avoids touching spaCy at all when a plain string is passed.
    if not isinstance(text, str):
        text = getattr(text, "text", text)

    # Replace newlines with spaces so words on adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    # Remove emails: any whitespace-delimited token containing '@',
    # together with one trailing whitespace character
    text = re.sub(r'\S*@\S*\s?', '', text)
    # Remove hashtags (must run before punctuation removal, which would strip the '#')
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation: everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    return text

def extract_patterns(text):
    """
        Collect the punctuation marks, emails and hashtags found in the given text.

        Returns a tuple ``(punctuation, emails, hashtags)``; each element is
        the list of matches for that category, in order of appearance.
    """
    # One regex per category, listed in the same order as the returned tuple
    patterns = (
        r'[^\w\s]',  # punctuation: neither a word character nor whitespace
        r'\S+@\S+',  # emails: non-whitespace runs on both sides of an '@'
        r'#\w+',     # hashtags: '#' followed by word characters
    )

    return tuple(re.findall(pattern, text) for pattern in patterns)

def remove_punct_nltk(text):
    """
        Remove punctuation from the given text using NLTK's RegexpTokenizer.

        The text is split into runs of word characters and the runs are
        joined back together with single spaces, so punctuation is dropped
        and any original whitespace is normalised to one space between tokens.

        Args:
            text: The string to clean.

        Returns:
            The text with punctuation removed, as a single string.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize() returns a new list and leaves `text` untouched, so the
    # tokens must be used to build the result; returning `text` itself
    # would make this function a no-op.
    tokens = tokenizer.tokenize(text)
    return ' '.join(tokens)