Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
from __future__ import annotations | |
import re | |
from typing import Optional | |
import pywordsegment | |
import nltk | |
from nltk.stem import WordNetLemmatizer | |
from nltk.stem.snowball import SnowballStemmer | |
from .h2p import H2p | |
from .h2p import replace_first | |
from . import format_ph as ph | |
from .dict_reader import DictReader | |
from .text.numbers import normalize_numbers | |
from .filter import filter_text | |
from .processors import Processor | |
from copy import deepcopy | |
# Captures the digits of a parenthesised numeric suffix, e.g. the "1" in "word(1)"
re_digit = re.compile(r"\((\d+)\)")
# Greedy: spans from the first "(" to the last ")" in the string, whatever is inside
re_bracket_with_digit = re.compile(r"\(.*\)")


def _ensure_nltk_corpora():
    """Download the WordNet and OMW-1.4 corpora when either cannot be found locally."""
    try:
        # nltk.data.find raises LookupError for a resource that is not installed
        for resource in ('corpora/wordnet.zip', 'corpora/omw-1.4.zip'):
            nltk.data.find(resource)
    except LookupError:
        # Either one missing triggers a download of both, in this order
        for package in ('wordnet', 'omw-1.4'):
            nltk.download(package)


# Runs once at import time, before CMUDictExt is defined
_ensure_nltk_corpora()
class CMUDictExt:
    """Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing."""

    # Accepted values for ``unresolved_mode`` (checked in __init__ and again in convert)
    _UNRESOLVED_MODES = ('keep', 'remove', 'drop')

    def __init__(self, cmu_dict_path: Optional[str] = None, h2p_dict_path: Optional[str] = None,
                 cmu_multi_mode: int = 0, process_numbers: bool = True, phoneme_brackets: bool = True,
                 unresolved_mode: str = 'keep'):
        # noinspection GrazieInspection
        """
        Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.

        CMU multi-entry resolution modes:
            - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
            - -1 : Skip resolving any entry with multiple pronunciations.
            - 0 : Resolve using default un-numbered pronunciation.
            - 1 : Resolve using (1) numbered pronunciation.
            - n : Resolve using (n) numbered pronunciation.
            - If a higher number is specified than available for the word, the highest available number is used.

        Unresolved word resolution modes:
            - keep : Keep the text-form word in the output.
            - remove : Remove the text-form word from the output.
            - drop : Return the line as None if any word is unresolved.

        :param cmu_dict_path: Path to CMU dictionary file (.txt)
        :type: str
        :param h2p_dict_path: Path to Custom H2p dictionary (.json)
        :type: str
        :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
        :type: int
        :param process_numbers: Normalize numbers to text form before conversion, if enabled.
        :type: bool
        :param phoneme_brackets: If True, phonemes are wrapped in curly brackets.
        :type: bool
        :param unresolved_mode: One of 'keep', 'remove' or 'drop' (see above).
        :type: str
        :raises ValueError: If unresolved_mode is not one of the accepted values.
        """
        # Validate first, so a bad argument fails before any dictionary is loaded
        if unresolved_mode not in self._UNRESOLVED_MODES:
            raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
        self.unresolved_mode = unresolved_mode

        self.cmu_dict_path = cmu_dict_path  # Path to CMU dictionary file (.txt), if None, uses built-in
        self.h2p_dict_path = h2p_dict_path  # Path to Custom H2p dictionary (.json), if None, uses built-in
        self.cmu_multi_mode = cmu_multi_mode  # CMU multi-entry resolution mode
        self.process_numbers = process_numbers  # Normalize numbers to text form, if enabled
        self.phoneme_brackets = phoneme_brackets  # If True, phonemes are wrapped in curly brackets.
        self.dict = DictReader(self.cmu_dict_path).dict  # CMU Dictionary
        self.h2p = H2p(self.h2p_dict_path, preload=True)  # H2p parser
        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
        self.stem = SnowballStemmer('english').stem  # Snowball Stemmer - used to find stem root of words
        self.segment = pywordsegment.WordSegmenter().segment  # Word Segmenter
        self.p = Processor(self)  # Processor for processing text

        # Features
        # Auto pluralization and de-pluralization
        self.ft_auto_plural = True
        # Auto splits and infers possessive forms of original words
        self.ft_auto_pos = True
        # Auto splits 'll
        self.ft_auto_ll = True
        # Auto splits and infers hyphenated words
        self.ft_auto_hyphenated = True
        # Auto splits possible compound words
        self.ft_auto_compound = True
        # Analyzes word root stem and infers pronunciation separately
        # i.e. 'generously' -> 'generous' + 'ly'
        self.ft_stem = True
        # Forces compound words using manual lookup
        self.ft_auto_compound_l2 = True

    @staticmethod
    def _variant_index(num: int, count: int) -> int:
        """
        Map the number from a 'word(n)' suffix onto a valid index into `count` pronunciations.

        The number is treated as 1-based and clamped to the range [0, count - 1],
        so out-of-range numbers fall back to the first or the last pronunciation.
        """
        # NOTE(review): CMUdict numbers its *second* pronunciation "(1)", so `num` itself
        # may be the intended index - confirm against how DictReader orders entries.
        return min(max(num - 1, 0), count - 1)

    def lookup(self, text: str, pos: Optional[str] = None, ph_format: str = 'sds') -> str | list | None:
        # noinspection GrazieInspection
        """
        Gets the CMU Dictionary entry for a word.

        A direct dictionary hit is returned as-is. Otherwise the enabled feature
        processors are tried in a fixed order and the first result wins.

        Options for ph_format:
            - 'sds' space delimited string
            - 'sds_b' space delimited string with curly brackets
            - 'list' list of phoneme strings

        :param text: Word to lookup
        :type: str
        :param pos: Part of speech tag (Optional)
        :type: str
        :param ph_format: Format of the phonemes to return
        :type: str
        :return: Phonemes in the requested format, or None if the word could not be resolved.
        :raises ValueError: If a pronunciation was found and ph_format is not a supported value.
        """

        def format_as(in_phoneme):
            # Convert a found pronunciation into the representation requested by ph_format
            if ph_format == 'sds':
                return ph.to_sds(in_phoneme)
            if ph_format == 'sds_b':
                return ph.with_cb(ph.to_sds(in_phoneme))
            if ph_format == 'list':
                return ph.to_list(in_phoneme)
            raise ValueError('Invalid value for ph_format: {}'.format(ph_format))

        # Get the CMU Dictionary entry for the word
        word = text.lower()
        entry = deepcopy(self.dict.get(word))  # Ensure safe copy of entry

        # Has entry, return it directly
        if entry is not None:
            return format_as(entry)

        # Auto Possessive Processor
        if self.ft_auto_pos:
            res = self.p.auto_possessives(word)
            if res is not None:
                return format_as(res)

        # Auto Contractions for "ll" or "d"
        if self.ft_auto_ll:
            res = self.p.auto_contractions(word)
            if res is not None:
                return format_as(res)

        # Check for hyphenated words
        if self.ft_auto_hyphenated:
            res = self.p.auto_hyphenated(word)
            if res is not None:
                return format_as(res)

        # Check for compound words
        if self.ft_auto_compound:
            res = self.p.auto_compound(word)
            if res is not None:
                return format_as(res)

        # No entry, detect if this is a numbered multi-pronunciation request such as 'word(2)'.
        # Searching (rather than indexing findall()[0]) avoids an IndexError for words that
        # contain brackets and a digit but no '(<digits>)' group.
        variant = re_digit.search(word)
        if variant is not None:
            num = int(variant.group(1))
            # Remove the integer and bracket from the word
            actual_word = re.sub(re_bracket_with_digit, "", word)
            # See if this is a valid entry
            result = deepcopy(self.dict.get(actual_word))  # Ensure safe copy of entry
            # Only index into a non-empty list of pronunciations
            if result:
                # Numbers above the available count resolve to the highest available entry
                return format_as(result[self._variant_index(num, len(result))])

        # Auto de-pluralization
        # This is placed near the end because we need to do a pos-tag process
        if self.ft_auto_plural:
            res = self.p.auto_plural(word, pos)
            if res is not None:
                return format_as(res)

        # Stem check
        # Supported modes for words ending in: "ing", "ingly", "ly"
        # noinspection SpellCheckingInspection
        if self.ft_stem:
            res = self.p.auto_stem(word)
            if res is not None:
                return format_as(res)

        # Force compounding
        if self.ft_auto_compound_l2:
            res = self.p.auto_compound_l2(word)
            if res is not None:
                return format_as(res)

        # If not found
        return None

    def convert(self, text: str) -> str | None:
        # noinspection GrazieInspection
        """
        Replace a grapheme text line with phonemes.

        Words found in the H2p dictionary are resolved there using their part of
        speech tag. All other words go through lookup(). Each resolved word is
        replaced in the text by its phonemes wrapped in curly brackets.

        :param text: Text line to be converted
        :type: str
        :return: The converted line, or None when unresolved_mode is 'drop' and a word is unresolved.
        :raises ValueError: If self.unresolved_mode holds an unsupported value.
        """
        # Re-check the mode here because the attribute can be reassigned after construction
        if self.unresolved_mode not in self._UNRESOLVED_MODES:
            raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
        ur_mode = self.unresolved_mode

        # Normalize numbers, if enabled
        if self.process_numbers:
            text = normalize_numbers(text)

        # Filter and Tokenize
        f_text = filter_text(text, preserve_case=True)
        words = self.h2p.tokenize(f_text)

        # Run POS tagging
        tags = self.h2p.get_tags(words)

        # Loop through words and pos tags
        for word, pos in tags:
            # Skip punctuation
            if word == '.':
                continue

            if self.h2p.dict.contains(word):
                # Heteronym: pick the pronunciation from the H2p dictionary
                phonemes = self.h2p.dict.get_phoneme(word, pos)
            else:
                # Not a heteronym: fall back to the CMU dictionary and processors
                phonemes = self.lookup(word, pos)
                if phonemes is None:
                    if ur_mode == 'drop':
                        return None
                    if ur_mode == 'remove':
                        text = replace_first(word, '', text)
                    # 'keep' leaves the original word in place
                    continue

            # NOTE(review): brackets are always applied here; self.phoneme_brackets is not consulted.
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            # Replace word with phonemes
            text = replace_first(word, f_ph, text)

        # Return text
        return text