|
"""This file contains functions that can generates hand-crafted features from the text""" |
|
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    'count_words',
    'count_punctuation',
    'count_how_many_words_are_repeating',
    'count_misspelled_words',
    'preprocess_test',
]
|
|
|
import re |
|
from collections import Counter |
|
from string import punctuation |
|
from typing import Dict |
|
|
|
from src.spell_checker import SmartSpellChecker |
|
|
|
# Placeholder tokens that may appear in the raw text, mapped to the plain
# word(s) substituted for them by ``preprocess_test``. Some keys look like
# misspelled variants of others (e.g. 'PROEPR_NAME', 'Generic_Namea') and
# map to the same replacement as the correctly spelled token.
underscores_to_replace = {
    'Generic_Name': 'name',
    'OTHER_NAME': 'name',
    'STUDENT_NAME': 'name',
    'Generic_Namea': 'name',
    'PROPER_NAME': 'proper name',
    'PROEPR_NAME': 'proper name',
    'Generic_School': 'school',
    'SCHOOL_NAME': 'school',
    'Generic_school': 'school',
    'TEACHER_NAME': 'teacher',
    'Generic_City': 'city',
    'LOCATION_NAME': 'location',
    'STORE_NAME': 'store',
    'RESTAURANT_NAME': 'restaurant',
    'LANGUAGE_NAME': 'language',
}


def preprocess_test(text: str) -> str:
    """Normalise raw text before feature extraction.

    Steps, in order:

    1. Replace every placeholder listed in ``underscores_to_replace``.
    2. Replace digits and the symbols ``% @ \\ # $ & ^ " _ ( ) * + - /``
       with a space.
    3. Insert a space after ``. , : ; ! ?`` when the next character is not
       whitespace (so ``"a,b"`` becomes ``"a, b"``).
    4. Collapse every run of whitespace (including newlines and tabs) into
       a single space.

    The result is not stripped, so a leading or trailing space may remain.

    Args:
        text: Raw input text.

    Returns:
        The normalised text.
    """
    # Longest placeholders first: 'Generic_Name' is a prefix of
    # 'Generic_Namea', and replacing the shorter one first would leave a
    # stray "namea" behind and make the longer key unreachable.
    for key in sorted(underscores_to_replace, key=len, reverse=True):
        text = text.replace(key, underscores_to_replace[key])

    text = re.sub(r"[\d%@\\#$&^\"_()*+\-/]", " ", text)
    text = re.sub(r'(?<=[.,:;!?])(?=\S)', " ", text)
    # Newlines and tabs are whitespace, so this single pass also turns them
    # into plain spaces.
    text = re.sub(r"\s+", " ", text)

    return text
|
|
|
|
|
def get_word_counter(text: str) -> Dict[str, int]:
    """Count how often each whitespace-separated word occurs in ``text``.

    The characters ``. , ! ? ; :`` are treated as separators; any other
    punctuation stays attached to its word. Counting is case-sensitive.

    Args:
        text: Input text.

    Returns:
        A ``collections.Counter`` mapping each distinct word to its number
        of occurrences (empty for empty or whitespace-only input).
    """
    without_separators = re.sub(r"[.,!?;:]", " ", text)
    return Counter(without_separators.split())
|
|
|
|
|
def count_punctuation(text: str) -> Dict[str, int]:
    """Count occurrences of each punctuation character and of the space.

    Args:
        text: Input text.

    Returns:
        A dict with one ``'count_<symbol>'`` entry for every character in
        ``string.punctuation`` plus the space character; the value is the
        number of times that character occurs in ``text``.
    """
    return {
        f'count_{symbol}': text.count(symbol)
        for symbol in punctuation + " "
    }
|
|
|
|
|
def count_how_many_words_are_repeating(text: str) -> Dict[str, int]:
    """Count distinct words that occur at least N times, for N = 3..9.

    Words are obtained from ``get_word_counter``.

    Args:
        text: Input text.

    Returns:
        A dict with keys ``'3_word_repeated'`` .. ``'9_word_repeated'``;
        each value is the number of distinct words whose occurrence count
        is greater than or equal to that threshold.
    """
    occurrences = list(get_word_counter(text).values())

    return {
        f'{threshold}_word_repeated': sum(
            1 for n_seen in occurrences if n_seen >= threshold
        )
        for threshold in range(3, 10)
    }
|
|
|
|
|
def count_misspelled_words(text: str, spellcheck: SmartSpellChecker) -> Dict[str, int]:
    """Count the words of ``text`` that the spell checker reports as unknown.

    The word counter from ``get_word_counter`` is handed to
    ``spellcheck.unknown``. NOTE(review): this presumably iterates the
    counter's keys, so each distinct word is checked once; confirm against
    ``SmartSpellChecker.unknown``.

    Args:
        text: Input text.
        spellcheck: Spell checker whose ``unknown`` method returns a sized
            collection of unrecognised words.

    Returns:
        ``{'n_misspelled_words': <size of the collection returned by
        spellcheck.unknown>}``.
    """
    unknown_words = spellcheck.unknown(get_word_counter(text))
    return {'n_misspelled_words': len(unknown_words)}
|
|
|
|
|
def count_words(text: str) -> Dict[str, int]:
    """Return the length of ``text`` measured in words.

    Words are obtained from ``get_word_counter``. Every occurrence is
    counted, so repeated words contribute once per occurrence (taking
    ``len()`` of the counter would only give the number of distinct words).

    Args:
        text: Input text.

    Returns:
        ``{'length': <total number of words>}``.
    """
    return {'length': sum(get_word_counter(text).values())}
|
|