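# NOTE: the following lines are residue from the web page this file was copied from
# (they are not part of the module); kept verbatim as a comment so the file still parses:
#   Pendrokar's picture
#   ionite34's h2p_parser and dep required for English
#   2080fde
#   raw
#   history blame
#   10 kB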
# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
from __future__ import annotations
import re
from typing import Optional
import pywordsegment
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from .h2p import H2p
from .h2p import replace_first
from . import format_ph as ph
from .dict_reader import DictReader
from .text.numbers import normalize_numbers
from .filter import filter_text
from .processors import Processor
from copy import deepcopy
# Captures the digits inside a parenthesised numeric suffix, e.g. the "1" in "a(1)".
re_digit = re.compile(r"\((\d+)\)")
# Greedy match from the first "(" to the last ")"; used to strip that suffix from a word.
re_bracket_with_digit = re.compile(r"\(.*\)")
# Verify the WordNet corpora (wordnet, omw-1.4) are present locally.
# If either lookup fails, both packages are requested from nltk.download.
try:
    for _corpus_zip in ('corpora/wordnet.zip', 'corpora/omw-1.4.zip'):
        nltk.data.find(_corpus_zip)
except LookupError:
    for _package in ('wordnet', 'omw-1.4'):
        nltk.download(_package)
class CMUDictExt:
def __init__(self, cmu_dict_path: str = None, h2p_dict_path: str = None, cmu_multi_mode: int = 0,
process_numbers: bool = True, phoneme_brackets: bool = True, unresolved_mode: str = 'keep'):
# noinspection GrazieInspection
"""
Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.
CMU multi-entry resolution modes:
- -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
- -1 : Skip resolving any entry with multiple pronunciations.
- 0 : Resolve using default un-numbered pronunciation.
- 1 : Resolve using (1) numbered pronunciation.
- n : Resolve using (n) numbered pronunciation.
- If a higher number is specified than available for the word, the highest available number is used.
Unresolved word resolution modes:
- keep : Keep the text-form word in the output.
- remove : Remove the text-form word from the output.
- drop : Return the line as None if any word is unresolved.
:param cmu_dict_path: Path to CMU dictionary file (.txt)
:type: str
:param h2p_dict_path: Path to Custom H2p dictionary (.json)
:type: str
:param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
:type: int
"""
# Check valid unresolved_mode argument
if unresolved_mode not in ['keep', 'remove', 'drop']:
raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
self.unresolved_mode = unresolved_mode
self.cmu_dict_path = cmu_dict_path # Path to CMU dictionary file (.txt), if None, uses built-in
self.h2p_dict_path = h2p_dict_path # Path to Custom H2p dictionary (.json), if None, uses built-in
self.cmu_multi_mode = cmu_multi_mode # CMU multi-entry resolution mode
self.process_numbers = process_numbers # Normalize numbers to text form, if enabled
self.phoneme_brackets = phoneme_brackets # If True, phonemes are wrapped in curly brackets.
self.dict = DictReader(self.cmu_dict_path).dict # CMU Dictionary
self.h2p = H2p(self.h2p_dict_path, preload=True) # H2p parser
self.lemmatize = WordNetLemmatizer().lemmatize # WordNet Lemmatizer - used to find singular form
self.stem = SnowballStemmer('english').stem # Snowball Stemmer - used to find stem root of words
self.segment = pywordsegment.WordSegmenter().segment # Word Segmenter
self.p = Processor(self) # Processor for processing text
# Features
# Auto pluralization and de-pluralization
self.ft_auto_plural = True
# Auto splits and infers possessive forms of original words
self.ft_auto_pos = True
# Auto splits 'll
self.ft_auto_ll = True
# Auto splits and infers hyphenated words
self.ft_auto_hyphenated = True
# Auto splits possible compound words
self.ft_auto_compound = True
# Analyzes word root stem and infers pronunciation separately
# i.e. 'generously' -> 'generous' + 'ly'
self.ft_stem = True
# Forces compound words using manual lookup
self.ft_auto_compound_l2 = True
def lookup(self, text: str, pos: str = None, ph_format: str = 'sds') -> str | list | None:
# noinspection GrazieInspection
"""
Gets the CMU Dictionary entry for a word.
Options for ph_format:
- 'sds' space delimited string
- 'sds_b' space delimited string with curly brackets
- 'list' list of phoneme strings
:param pos: Part of speech tag (Optional)
:param ph_format: Format of the phonemes to return:
:type: str
:param text: Word to lookup
:type: str
"""
def format_as(in_phoneme):
if ph_format == 'sds':
output = ph.to_sds(in_phoneme)
elif ph_format == 'sds_b':
output = ph.with_cb(ph.to_sds(in_phoneme))
elif ph_format == 'list':
output = ph.to_list(in_phoneme)
else:
raise ValueError('Invalid value for ph_format: {}'.format(ph_format))
return output
# Get the CMU Dictionary entry for the word
word = text.lower()
entry = deepcopy(self.dict.get(word)) # Ensure safe copy of entry
# Has entry, return it directly
if entry is not None:
return format_as(entry)
# Auto Possessive Processor
if self.ft_auto_pos:
res = self.p.auto_possessives(word)
if res is not None:
return format_as(res)
# Auto Contractions for "ll" or "d"
if self.ft_auto_ll:
res = self.p.auto_contractions(word)
if res is not None:
return format_as(res)
# Check for hyphenated words
if self.ft_auto_hyphenated:
res = self.p.auto_hyphenated(word)
if res is not None:
return format_as(res)
# Check for compound words
if self.ft_auto_compound:
res = self.p.auto_compound(word)
if res is not None:
return format_as(res)
# No entry, detect if this is a multi-word entry
if '(' in word and ')' in word and any(char.isdigit() for char in word):
# Parse the integer from the word using regex
num = int(re.findall(re_digit, word)[0])
# If found
if num is not None:
# Remove the integer and bracket from the word
actual_word = re.sub(re_bracket_with_digit, "", word)
# See if this is a valid entry
result = deepcopy(self.dict.get(actual_word)) # Ensure safe copy of entry
# If found:
if result is not None:
# Translate the integer to index
index = min(num - 1, 0)
# Check if index is less than the number of pronunciations
if index < len(result):
# Return the entry using the provided num index
return format_as(result[index])
# If entry is higher
else:
# Return the highest available entry
return format_as(result[-1])
# Auto de-pluralization
# This is placed near the end because we need to do a pos-tag process
if self.ft_auto_plural:
res = self.p.auto_plural(word, pos)
if res is not None:
return format_as(res)
# Stem check
# noinspection SpellCheckingInspection
"""
Supported modes for words ending in:
"ing", "ingly", "ly"
"""
if self.ft_stem:
res = self.p.auto_stem(word)
if res is not None:
return format_as(res)
# Force compounding
if self.ft_auto_compound_l2:
res = self.p.auto_compound_l2(word)
if res is not None:
return format_as(res)
# If not found
return None
def convert(self, text: str) -> str | None:
# noinspection GrazieInspection
"""
Replace a grapheme text line with phonemes.
:param text: Text line to be converted
:type: str
"""
# Check valid unresolved_mode argument
if self.unresolved_mode not in ['keep', 'remove', 'drop']:
raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
ur_mode = self.unresolved_mode
# Normalize numbers, if enabled
if self.process_numbers:
text = normalize_numbers(text)
# Filter and Tokenize
f_text = filter_text(text, preserve_case=True)
words = self.h2p.tokenize(f_text)
# Run POS tagging
tags = self.h2p.get_tags(words)
# Loop through words and pos tags
for word, pos in tags:
# Skip punctuation
if word == '.':
continue
# If word not in h2p dict, check CMU dict
if not self.h2p.dict.contains(word):
entry = self.lookup(word, pos)
if entry is None:
if ur_mode == 'drop':
return None
if ur_mode == 'remove':
text = replace_first(word, '', text)
continue
# Do replace
f_ph = ph.with_cb(ph.to_sds(entry))
text = replace_first(word, f_ph, text)
continue
# For word in h2p dict, get phonemes
phonemes = self.h2p.dict.get_phoneme(word, pos)
# Format phonemes
f_ph = ph.with_cb(ph.to_sds(phonemes))
# Replace word with phonemes
text = replace_first(word, f_ph, text)
# Return text
return text