# Pendrokar's picture
# ionite34's h2p_parser and dep required for English
# 2080fde
# raw history blame
# No virus 4.29 kB
import nltk
import re
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk import pos_tag_sents
from .dictionary import Dictionary
from .filter import filter_text as ft
from . import format_ph as ph
# Ensure the NLTK POS-tagger model is available; download it on first use.
# nltk.data.find raises LookupError when the resource is not installed,
# so a missing tagger triggers a one-time download at import time.
try:
    nltk.data.find('taggers/averaged_perceptron_tagger.zip')
except LookupError:
    # Downloads into the default NLTK data directory (network I/O).
    nltk.download('averaged_perceptron_tagger')
# Method to use Regex to replace the first instance of a word with its phonemes
def replace_first(target, replacement, text):
    """Replace the first whole-word, case-insensitive occurrence of target.

    :param target: Word to replace; returned text is unchanged if None or ''
    :type target: str
    :param replacement: Replacement string (phoneme representation)
    :type replacement: str
    :param text: Text to search in
    :type text: str
    :return: Text with the first matching word replaced
    :rtype: str
    """
    # Skip if target invalid
    if target is None or target == '':
        return text
    # re.escape guards against regex metacharacters in the word itself
    # (e.g. "a.c" must match literally, not as a regex pattern).
    return re.sub(r'(?i)\b' + re.escape(target) + r'\b', replacement, text, count=1)
class H2p:
    """Heteronym-to-phoneme parser.

    Detects heteronyms (words whose pronunciation depends on part of
    speech) in text via a dictionary lookup, disambiguates them with an
    NLTK POS tagger, and replaces them with phoneme strings.
    """

    def __init__(self, dict_path=None, preload=False, phoneme_format=''):
        """
        Creates a H2p parser

        Supported phoneme formats:
            - Space delimited
            - Space delimited surrounded by { }

        :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
        :type dict_path: str
        :param preload: Preloads the tokenizer and tagger during initialization
        :type preload: bool
        :param phoneme_format: Phoneme format selector (stored; not read by the methods visible here)
        :type phoneme_format: str
        """
        # Supported phoneme formats
        self.phoneme_format = phoneme_format
        self.dict = Dictionary(dict_path)
        self.tokenize = TweetTokenizer().tokenize
        # Single tagger entry point used by all methods below.
        self.get_tags = pos_tag
        if preload:
            self.preload()

    def preload(self):
        """Warm up the tokenizer and POS tagger so the first real call is fast.

        :raises AssertionError: If the tokenizer or tagger returns an unexpected result
        """
        tokens = self.tokenize('a')
        # Explicit checks instead of `assert` so they survive `python -O`;
        # AssertionError is kept for backward compatibility.
        if tokens != ['a']:
            raise AssertionError('Tokenizer preload produced unexpected output')
        if self.get_tags(tokens)[0][0] != 'a':
            raise AssertionError('POS tagger preload produced unexpected output')

    def contains_het(self, text):
        """Check whether a line of text contains any heteronyms.

        :param text: Input line
        :type text: str
        :return: Tuple of (found_any, list of heteronym words found)
        :rtype: tuple[bool, list[str]]
        """
        # Filter the text, then tokenize and match against the dictionary
        words = self.tokenize(ft(text))
        hets = [word for word in words if self.dict.contains(word)]
        return len(hets) > 0, hets

    def replace_het(self, text):
        """Replace each heteronym in a line with its POS-resolved phonemes.

        :param text: Input line
        :type text: str
        :return: Line with heteronyms replaced by formatted phoneme strings
        :rtype: str
        """
        # Tag a filtered, case-preserving working copy; replacements are
        # applied to the original text so its formatting is kept.
        working_text = ft(text, preserve_case=True)
        words = self.tokenize(working_text)
        tags = self.get_tags(words)
        for word, pos in tags:
            # Skip words that are not heteronyms
            if not self.dict.contains(word):
                continue
            # Resolve pronunciation by part of speech, then format
            phonemes = self.dict.get_phoneme(word, pos)
            f_ph = ph.with_cb(ph.to_sds(phonemes))
            # Replace the first remaining occurrence of the word
            text = replace_first(word, f_ph, text)
        return text

    def replace_het_list(self, text_list):
        """Replace heteronyms in a list of text lines.

        Slightly faster than calling replace_het() on each line because
        POS tagging is batched via pos_tag_sents.

        :param text_list: Input lines (the caller's list is not modified)
        :type text_list: list[str]
        :return: New list with heteronyms replaced
        :rtype: list[str]
        """
        # Work on a copy so the caller's list is never mutated in place.
        result = list(text_list)
        # Filter and tokenize every line, then tag all sentences in one batch
        working_list = [ft(text, preserve_case=True) for text in text_list]
        sentence_words = [self.tokenize(text) for text in working_list]
        tags_list = self.get_tags if False else pos_tag_sents(sentence_words)
        for index, tags in enumerate(tags_list):
            for word, pos in tags:
                # Skip words that are not heteronyms
                if not self.dict.contains(word):
                    continue
                phonemes = self.dict.get_phoneme(word, pos)
                f_ph = ph.with_cb(ph.to_sds(phonemes))
                result[index] = replace_first(word, f_ph, result[index])
        return result

    def tag(self, text):
        """Return the POS tags for a line of text.

        :param text: Input line
        :type text: str
        :return: List of POS tag strings, one per token
        :rtype: list[str]
        """
        working_text = ft(text, preserve_case=True)
        words = self.tokenize(working_text)
        tags = self.get_tags(words)
        # Each tag tuple is (token, pos); keep only the pos element
        return [tag[1] for tag in tags]