Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

App Files Files Community

xVASynth-TTS / resources /app /python /xvapitch /text /h2p_parser /dict_reader.py

Pendrokar

ionite34's h2p_parser and dep required for English

2080fde 7 months ago

raw

history blame

No virus

3.6 kB

	# This reads a CMUDict formatted dictionary as a dictionary object
	import re
	import h2p_parser.format_ph as ph
	from . import DATA_PATH


	# File name of the default dictionary; DictReader joins it onto DATA_PATH
	# when it is constructed without an explicit filename.
	_dict_primary = 'cmudict.dict'


	def read_dict(filename: str) -> list:
	    """Load a CMUDict-style dictionary file as a list of raw lines.

	    Lines that begin with ";;;" are dropped. Every other line is returned
	    unchanged, trailing newline included.
	    """
	    with open(filename, encoding='utf-8', mode='r') as handle:
	        # Filter while streaming instead of materialising the file first
	        return [raw for raw in handle if not raw.startswith(';;;')]


	def parse_dict(lines: list) -> dict:
	    """Parse CMUDict-formatted lines into a word -> pronunciations mapping.

	    Two layouts are recognised. The layout is chosen once, by inspecting
	    the first ten lines:

	    - DSD (double-space delimited): word and phonemes are separated by two
	      spaces, e.g. ``WORD<two spaces>W ER1 D``.
	    - SSD (single-space delimited): word and phonemes are separated by the
	      first space, and anything after ``#`` is a comment, e.g.
	      ``WORD W ER1 D # Comment``.

	    Alternate pronunciations are written as ``word(N)``. Their phonemes are
	    appended to the base word's list; when the base word is already
	    present, they are additionally stored under the literal ``word(N)`` key.
	    A repeated entry without an ``(N)`` suffix is ignored.

	    Args:
	        lines: Raw dictionary lines (newlines and padding are stripped here).

	    Returns:
	        Dict mapping each lowercased word to a list of pronunciations, each
	        pronunciation being the value ``ph.to_list`` returns for the entry.
	    """
	    # Two literal spaces. Spelled as a product so that whitespace
	    # normalisation of this file cannot silently reduce it to one space.
	    dsd_sep = ' ' * 2

	    # Default to SSD unless one of the first 10 lines uses the DSD delimiter
	    dict_form = 'SSD'
	    for line in lines[:10]:
	        if dsd_sep in line.strip():
	            dict_form = 'DSD'
	            break

	    parsed_dict = {}
	    for line in lines:
	        line = line.strip()

	        # Split into word / phoneme string depending on format
	        if dict_form == 'DSD':
	            word, sep, phoneme_str = line.partition(dsd_sep)
	        else:
	            word, sep, phoneme_str = line.partition(' ')
	            # Drop a trailing "# comment"
	            phoneme_str = phoneme_str.split('#')[0]

	        # Skip empty lines and lines that lack the word/phoneme delimiter
	        if not sep:
	            continue

	        word = word.lower()
	        # Trim padding left next to the delimiter or before a comment, then
	        # wrap in a nested list: one inner list per pronunciation
	        phonemes = [ph.to_list(phoneme_str.strip())]
	        word_num = 0
	        word_orig = None

	        # Detect an alternate-pronunciation entry such as "word(2)".
	        # search() returns None when there is no "(N)" group, so words that
	        # merely contain parentheses and a digit are treated as plain words.
	        variant = re.search(r"\((\d+)\)", word)
	        if variant is not None:
	            word_orig = word
	            # Remove the integer and brackets to recover the base word
	            word = re.sub(r"\(.*\)", "", word)
	            word_num = int(variant.group(1))

	        if word in parsed_dict:
	            # A repeated base entry (no variant number) is ignored
	            if word_num == 0:
	                continue
	            # Append this pronunciation to the base word
	            parsed_dict[word].extend(phonemes)
	            # Also keep it reachable under the original "word(N)" key
	            if word_orig is not None:
	                parsed_dict[word_orig] = phonemes
	        else:
	            parsed_dict[word] = phonemes

	    return parsed_dict


	class DictReader:
	    """Load a CMUDict-formatted pronunciation dictionary into memory.

	    Attributes:
	        filename: The path passed by the caller, or None when the default
	            dictionary was loaded.
	        dict: The mapping returned by ``parse_dict`` for the loaded file.
	    """

	    def __init__(self, filename=None):
	        """Read and parse the dictionary file.

	        Args:
	            filename: Path of the dictionary to load. When None, the file
	                named by ``_dict_primary`` under ``DATA_PATH`` is used.
	        """
	        self.filename = filename
	        self.dict = {}
	        if filename is not None:
	            source = filename
	        else:
	            # Pass the joined path straight to read_dict rather than
	            # entering it with `with`: pathlib.Path objects no longer
	            # support the context-manager protocol on Python 3.13, and
	            # entering one has had no effect since 3.9.
	            # NOTE(review): assumes DATA_PATH.joinpath() returns something
	            # open() accepts (e.g. a pathlib.Path) - confirm against the
	            # package __init__.
	            source = DATA_PATH.joinpath(_dict_primary)
	        self.dict = parse_dict(read_dict(source))