Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
# This reads a CMUDict formatted dictionary as a dictionary object | |
import re | |
import h2p_parser.format_ph as ph | |
from . import DATA_PATH | |
_dict_primary = 'cmudict.dict' | |
def read_dict(filename: str) -> list: | |
# Read the file | |
with open(filename, encoding='utf-8', mode='r') as f: | |
# Read the file into lines | |
lines = f.readlines() | |
# Remove any line starting with ";;;" | |
lines = [line for line in lines if not line.startswith(';;;')] | |
return lines | |
def parse_dict(lines: list) -> dict: | |
# Create a dictionary to store the parsed data | |
parsed_dict = {} | |
# Detect file format | |
# We will read the first 10 lines to determine the format | |
# Default to SSD format unless we find otherwise | |
dict_form = 'SSD' | |
for line in lines[:10]: | |
# Strip new lines | |
line = line.strip() | |
if line == '': | |
continue | |
""" | |
Format 1 (Double Space Delimited): | |
- Comment allowed to start with ";;;" | |
WORD W ER1 D | |
Format 2 (Single Space Delimited): | |
- Comment allowed at end of any line using "#" | |
WORD W ER1 D # Comment | |
""" | |
if ' ' in line: | |
dict_form = 'DSD' | |
break | |
# Iterate over the lines | |
for line in lines: | |
# Skip empty lines and lines with no space | |
line = line.strip() | |
if line == '' and ' ' not in line: | |
continue | |
# Split depending on format | |
if dict_form == 'DSD': | |
pairs = line.split(' ') | |
else: | |
space_index = line.find(' ') | |
line_split = line[:space_index], line[space_index + 1:] | |
pairs = line_split[0], line_split[1].split('#')[0] | |
word = str.lower(pairs[0]) # Get word and lowercase it | |
phonemes = ph.to_list(pairs[1]) # Convert to list of phonemes | |
phonemes = [phonemes] # Wrap in nested list | |
word_num = 0 | |
word_orig = None | |
# Detect if this is a multi-word entry | |
if ('(' in word) and (')' in word) and any(char.isdigit() for char in word): | |
# Parse the integer from the word using regex | |
result = int(re.findall(r"\((\d+)\)", word)[0]) | |
# If found | |
if result is not None: | |
# Set the original word | |
word_orig = word | |
# Remove the integer and bracket from the word | |
word = re.sub(r"\(.*\)", "", word) | |
# Set the word number to the result | |
word_num = result | |
# Check existing key | |
if word in parsed_dict: | |
# If word number is 0, ignore | |
if word_num == 0: | |
continue | |
# If word number is not 0, add phoneme to existing key at index | |
parsed_dict[word].extend(phonemes) | |
# Also add the original word if it exists | |
if word_orig is not None: | |
parsed_dict[word_orig] = phonemes | |
else: | |
# Create a new key | |
parsed_dict[word] = phonemes | |
# Return the dictionary | |
return parsed_dict | |
class DictReader: | |
def __init__(self, filename=None): | |
self.filename = filename | |
self.dict = {} | |
# If filename is None, use the default dictionary | |
# default = 'data' uses the dictionary file in the data module | |
# default = 'nltk' uses the nltk cmudict | |
if filename is not None: | |
self.dict = parse_dict(read_dict(filename)) | |
else: | |
with DATA_PATH.joinpath(_dict_primary) as f: | |
self.dict = parse_dict(read_dict(f)) | |