Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

App Files Files Community

xVASynth-TTS / resources /app /python /xvapitch /text /h2p_parser /cmudictext.py

Pendrokar

ionite34's h2p_parser and dep required for English

2080fde 9 months ago

raw

history blame

10 kB

	# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
	from __future__ import annotations

	import re
	from typing import Optional

	import pywordsegment
	import nltk
	from nltk.stem import WordNetLemmatizer
	from nltk.stem.snowball import SnowballStemmer
	from .h2p import H2p
	from .h2p import replace_first
	from . import format_ph as ph
	from .dict_reader import DictReader
	from .text.numbers import normalize_numbers
	from .filter import filter_text
	from .processors import Processor
	from copy import deepcopy

	# Matches a parenthesised number such as "(1)" and captures its digits.
	re_digit = re.compile(r"\((\d+)\)")
	# Matches everything from the first "(" through the last ")" (greedy).
	re_bracket_with_digit = re.compile(r"\(.*\)")

	# Verify that the nltk 'wordnet' and 'omw-1.4' resources are present locally.
	# If either lookup fails, both packages are downloaded.
	try:
	    for _corpus_path in ('corpora/wordnet.zip', 'corpora/omw-1.4.zip'):
	        nltk.data.find(_corpus_path)
	except LookupError:
	    for _package in ('wordnet', 'omw-1.4'):
	        nltk.download(_package)


	class CMUDictExt:
	    """Extended grapheme-to-phoneme conversion using the CMU Dictionary with heteronym parsing."""

	    # Accepted values for the ``unresolved_mode`` setting.
	    _UNRESOLVED_MODES = ('keep', 'remove', 'drop')

	    def __init__(self, cmu_dict_path: str | None = None, h2p_dict_path: str | None = None, cmu_multi_mode: int = 0,
	                 process_numbers: bool = True, phoneme_brackets: bool = True, unresolved_mode: str = 'keep'):
	        # noinspection GrazieInspection
	        """
	        Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.

	        CMU multi-entry resolution modes:
	            - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
	            - -1 : Skip resolving any entry with multiple pronunciations.
	            - 0 : Resolve using default un-numbered pronunciation.
	            - 1 : Resolve using (1) numbered pronunciation.
	            - n : Resolve using (n) numbered pronunciation.
	            - If a higher number is specified than available for the word, the highest available number is used.

	        Unresolved word resolution modes:
	            - keep : Keep the text-form word in the output.
	            - remove : Remove the text-form word from the output.
	            - drop : Return the line as None if any word is unresolved.

	        :param cmu_dict_path: Path to CMU dictionary file (.txt); if None, uses built-in
	        :type: str
	        :param h2p_dict_path: Path to Custom H2p dictionary (.json); if None, uses built-in
	        :type: str
	        :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
	        :type: int
	        :param process_numbers: Normalize numbers to text form before conversion.
	        :type: bool
	        :param phoneme_brackets: Whether phonemes are wrapped in curly brackets (stored on the instance).
	        :type: bool
	        :param unresolved_mode: One of 'keep', 'remove' or 'drop' (see above).
	        :type: str
	        :raises ValueError: If unresolved_mode is not one of the accepted values.
	        """

	        # Validate first, so an invalid mode fails before any dictionary is loaded
	        self.unresolved_mode = self._validate_unresolved_mode(unresolved_mode)

	        self.cmu_dict_path = cmu_dict_path  # Path to CMU dictionary file (.txt), if None, uses built-in
	        self.h2p_dict_path = h2p_dict_path  # Path to Custom H2p dictionary (.json), if None, uses built-in
	        self.cmu_multi_mode = cmu_multi_mode  # CMU multi-entry resolution mode
	        self.process_numbers = process_numbers  # Normalize numbers to text form, if enabled
	        # NOTE(review): this flag is stored but convert() below always wraps phonemes in
	        # curly brackets - confirm whether convert() is meant to honour it.
	        self.phoneme_brackets = phoneme_brackets  # If True, phonemes are wrapped in curly brackets.
	        self.dict = DictReader(self.cmu_dict_path).dict  # CMU Dictionary
	        self.h2p = H2p(self.h2p_dict_path, preload=True)  # H2p parser
	        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
	        self.stem = SnowballStemmer('english').stem  # Snowball Stemmer - used to find stem root of words
	        self.segment = pywordsegment.WordSegmenter().segment  # Word Segmenter
	        self.p = Processor(self)  # Processor for processing text

	        # Features
	        # Auto pluralization and de-pluralization
	        self.ft_auto_plural = True
	        # Auto splits and infers possessive forms of original words
	        self.ft_auto_pos = True
	        # Auto splits 'll
	        self.ft_auto_ll = True
	        # Auto splits and infers hyphenated words
	        self.ft_auto_hyphenated = True
	        # Auto splits possible compound words
	        self.ft_auto_compound = True
	        # Analyzes word root stem and infers pronunciation separately
	        # i.e. 'generously' -> 'generous' + 'ly'
	        self.ft_stem = True
	        # Forces compound words using manual lookup
	        self.ft_auto_compound_l2 = True

	    @classmethod
	    def _validate_unresolved_mode(cls, mode):
	        """Return ``mode`` unchanged, raising ValueError if it is not an accepted unresolved mode."""
	        if mode not in cls._UNRESOLVED_MODES:
	            raise ValueError(f'Invalid value for unresolved_mode: {mode}')
	        return mode

	    @staticmethod
	    def _select_variant(variants, num):
	        """Pick pronunciation number ``num`` from ``variants``, clamped to the available range."""
	        # NOTE(review): keeps the original ``num - 1`` translation from number to index;
	        # confirm against how DictReader orders numbered entries.
	        index = max(num - 1, 0)
	        if index < len(variants):
	            return variants[index]
	        # Requested number is higher than available: use the highest available entry
	        return variants[-1]

	    def lookup(self, text: str, pos: str = None, ph_format: str = 'sds') -> str | list | None:
	        # noinspection GrazieInspection
	        """
	        Gets the CMU Dictionary entry for a word.

	        The word is lower-cased and looked up directly. If there is no direct entry, the
	        enabled feature processors are tried in a fixed order and the first result wins.

	        Options for ph_format:

	        - 'sds' space delimited string
	        - 'sds_b' space delimited string with curly brackets
	        - 'list' list of phoneme strings

	        :param text: Word to lookup
	        :type: str
	        :param pos: Part of speech tag (Optional)
	        :param ph_format: Format of the phonemes to return:
	        :type: str
	        :return: The phonemes in the requested format, or None if the word could not be resolved.
	        :raises ValueError: If a result is found and ph_format is not one of the supported options.
	        """

	        def format_as(in_phoneme):
	            if ph_format == 'sds':
	                return ph.to_sds(in_phoneme)
	            if ph_format == 'sds_b':
	                return ph.with_cb(ph.to_sds(in_phoneme))
	            if ph_format == 'list':
	                return ph.to_list(in_phoneme)
	            raise ValueError(f'Invalid value for ph_format: {ph_format}')

	        # Get the CMU Dictionary entry for the word
	        word = text.lower()
	        entry = deepcopy(self.dict.get(word))  # Ensure safe copy of entry

	        # Has entry, return it directly
	        if entry is not None:
	            return format_as(entry)

	        # Auto Possessive Processor
	        if self.ft_auto_pos:
	            res = self.p.auto_possessives(word)
	            if res is not None:
	                return format_as(res)

	        # Auto Contractions for "ll" or "d"
	        if self.ft_auto_ll:
	            res = self.p.auto_contractions(word)
	            if res is not None:
	                return format_as(res)

	        # Check for hyphenated words
	        if self.ft_auto_hyphenated:
	            res = self.p.auto_hyphenated(word)
	            if res is not None:
	                return format_as(res)

	        # Check for compound words
	        if self.ft_auto_compound:
	            res = self.p.auto_compound(word)
	            if res is not None:
	                return format_as(res)

	        # No entry, detect if this is a numbered multi-entry word such as "word(2)".
	        # Using search() means a word with brackets and digits but no "(digits)" group
	        # falls through to the remaining processors instead of raising IndexError.
	        numbered = re_digit.search(word)
	        if numbered is not None:
	            num = int(numbered.group(1))
	            # Remove the integer and bracket from the word
	            actual_word = re_bracket_with_digit.sub("", word)
	            # See if this is a valid entry
	            variants = deepcopy(self.dict.get(actual_word))  # Ensure safe copy of entry
	            if variants:
	                return format_as(self._select_variant(variants, num))

	        # Auto de-pluralization
	        # This is placed near the end because we need to do a pos-tag process
	        if self.ft_auto_plural:
	            res = self.p.auto_plural(word, pos)
	            if res is not None:
	                return format_as(res)

	        # Stem check
	        # Supported modes for words ending in: "ing", "ingly", "ly"
	        # noinspection SpellCheckingInspection
	        if self.ft_stem:
	            res = self.p.auto_stem(word)
	            if res is not None:
	                return format_as(res)

	        # Force compounding
	        if self.ft_auto_compound_l2:
	            res = self.p.auto_compound_l2(word)
	            if res is not None:
	                return format_as(res)

	        # If not found
	        return None

	    def convert(self, text: str) -> str | None:
	        # noinspection GrazieInspection
	        """
	        Replace a grapheme text line with phonemes.

	        Each tagged word is resolved through the H2p dictionary when it contains the word,
	        otherwise through :meth:`lookup`. Resolved words are replaced in the line by their
	        phonemes wrapped in curly brackets.

	        :param text: Text line to be converted
	        :type: str
	        :return: The converted line, or None when unresolved_mode is 'drop' and a word is unresolved.
	        :raises ValueError: If self.unresolved_mode holds an invalid value.
	        """

	        # Re-check the mode, since the attribute may have been reassigned after construction
	        ur_mode = self._validate_unresolved_mode(self.unresolved_mode)

	        # Normalize numbers, if enabled
	        if self.process_numbers:
	            text = normalize_numbers(text)
	        # Filter and Tokenize
	        f_text = filter_text(text, preserve_case=True)
	        words = self.h2p.tokenize(f_text)
	        # Run POS tagging
	        tags = self.h2p.get_tags(words)

	        # Loop through words and pos tags
	        for word, pos in tags:
	            # Skip '.' tokens (punctuation)
	            if word == '.':
	                continue
	            if self.h2p.dict.contains(word):
	                # For word in h2p dict, get phonemes
	                phonemes = self.h2p.dict.get_phoneme(word, pos)
	            else:
	                # If word not in h2p dict, check CMU dict
	                phonemes = self.lookup(word, pos)
	                if phonemes is None:
	                    if ur_mode == 'drop':
	                        return None
	                    if ur_mode == 'remove':
	                        text = replace_first(word, '', text)
	                    # 'keep' leaves the text-form word in place
	                    continue
	            # Format phonemes and replace the word with them
	            f_ph = ph.with_cb(ph.to_sds(phonemes))
	            text = replace_first(word, f_ph, text)
	        # Return text
	        return text