# Pendrokar's picture
# ionite34's h2p_parser and dep required for English
# 2080fde
# raw history blame
# No virus 4.29 kB
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk import pos_tag_sents
from .dictionary import Dictionary
from .filter import filter_text as ft
from . import format_ph as ph
# Ensure the NLTK POS-tagger model is available; download it on first use.
# nltk.data.find raises LookupError when the resource is not installed,
# so a missing tagger triggers a one-time download at import time.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger.zip')
except LookupError:
    # Downloads into the default NLTK data directory (network I/O).
    nltk.download('averaged_perceptron_tagger')
# Method to use Regex to replace the first instance of a word with its phonemes
def replace_first(target, replacement, text):
    """Replace the first whole-word, case-insensitive occurrence of target.

    :param target: Word to replace; returned text is unchanged if None or ''
    :type target: str
    :param replacement: Replacement string (phoneme representation)
    :type replacement: str
    :param text: Text to search in
    :type text: str
    :return: Text with the first matching word replaced
    :rtype: str
    """
    # Skip if target invalid
    if target is None or target == '':
        return text
    # re.escape guards against regex metacharacters in the word itself
    # (e.g. "a.c" must match literally, not as a regex pattern).
    return re.sub(r'(?i)\b' + re.escape(target) + r'\b', replacement, text, count=1)
class H2p:
    """Heteronym-to-phoneme parser.

    Detects heteronyms (words whose pronunciation depends on part of
    speech) in text via a dictionary lookup, disambiguates them with an
    NLTK POS tagger, and replaces them with phoneme strings.
    """

    def __init__(self, dict_path=None, preload=False, phoneme_format=''):
        """
        Creates a H2p parser

        Supported phoneme formats:
            - Space delimited
            - Space delimited surrounded by { }

        :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
        :type dict_path: str
        :param preload: Preloads the tokenizer and tagger during initialization
        :type preload: bool
        :param phoneme_format: Phoneme format selector (stored; not read by the methods visible here)
        :type phoneme_format: str
        """
        # Supported phoneme formats
        self.phoneme_format = phoneme_format
        self.dict = Dictionary(dict_path)
        self.tokenize = TweetTokenizer().tokenize
        # Single tagger entry point used by all methods below.
        self.get_tags = pos_tag
        if preload:
            self.preload()

    def preload(self):
        """Warm up the tokenizer and POS tagger so the first real call is fast.

        :raises AssertionError: If the tokenizer or tagger returns an unexpected result
        """
        tokens = self.tokenize('a')
        # Explicit checks instead of `assert` so they survive `python -O`;
        # AssertionError is kept for backward compatibility.
        if tokens != ['a']:
            raise AssertionError('Tokenizer preload produced unexpected output')
        if self.get_tags(tokens)[0][0] != 'a':
            raise AssertionError('POS tagger preload produced unexpected output')

    def contains_het(self, text):
        """Check whether a line of text contains any heteronyms.

        :param text: Input line
        :type text: str
        :return: Tuple of (found_any, list of heteronym words found)
        :rtype: tuple[bool, list[str]]
        """
        # Filter the text, then tokenize and match against the dictionary
        words = self.tokenize(ft(text))
        hets = [word for word in words if self.dict.contains(word)]
        return len(hets) > 0, hets

    def replace_het(self, text):
        """Replace each heteronym in a line with its POS-resolved phonemes.

        :param text: Input line
        :type text: str
        :return: Line with heteronyms replaced by formatted phoneme strings
        :rtype: str
        """
        # Tag a filtered, case-preserving working copy; replacements are
        # applied to the original text so its formatting is kept.
        working_text = ft(text, preserve_case=True)
        words = self.tokenize(working_text)
        tags = self.get_tags(words)
        for word, pos in tags:
            # Skip words that are not heteronyms
            if not self.dict.contains(word):
                continue
            # Resolve pronunciation by part of speech, then format
            phonemes = self.dict.get_phoneme(word, pos)
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            # Replace the first remaining occurrence of the word
            text = replace_first(word, f_ph, text)
        return text

    def replace_het_list(self, text_list):
        """Replace heteronyms in a list of text lines.

        Slightly faster than calling replace_het() on each line because
        POS tagging is batched via pos_tag_sents.

        :param text_list: Input lines (the caller's list is not modified)
        :type text_list: list[str]
        :return: New list with heteronyms replaced
        :rtype: list[str]
        """
        # Work on a copy so the caller's list is never mutated in place.
        result = list(text_list)
        # Filter and tokenize every line, then tag all sentences in one batch
        working_list = [ft(text, preserve_case=True) for text in text_list]
        sentence_words = [self.tokenize(text) for text in working_list]
        tags_list = self.get_tags if False else pos_tag_sents(sentence_words)
        for index, tags in enumerate(tags_list):
            for word, pos in tags:
                # Skip words that are not heteronyms
                if not self.dict.contains(word):
                    continue
                phonemes = self.dict.get_phoneme(word, pos)
                f_ph = ph.with_cb(ph.to_sds(phonemes))
                result[index] = replace_first(word, f_ph, result[index])
        return result

    def tag(self, text):
        """Return the POS tags for a line of text.

        :param text: Input line
        :type text: str
        :return: List of POS tag strings, one per token
        :rtype: list[str]
        """
        working_text = ft(text, preserve_case=True)
        words = self.tokenize(working_text)
        tags = self.get_tags(words)
        # Each tag tuple is (token, pos); keep only the pos element
        return [tag[1] for tag in tags]