Pendrokar's picture
ionite34's h2p_parser and dep required for English
2080fde
raw
history blame
No virus
3.6 kB
# This reads a CMUDict formatted dictionary as a dictionary object
import re
import h2p_parser.format_ph as ph
from . import DATA_PATH
# File name of the default dictionary; DictReader joins it onto DATA_PATH when no filename is given
_dict_primary = 'cmudict.dict'
def read_dict(filename: str) -> list:
    """Read a dictionary file and return its lines without ';;;' comment lines.

    :param filename: Path of the UTF-8 encoded dictionary file to open.
    :return: The remaining lines in file order, line endings kept as read.
    """
    comment_prefix = ';;;'
    with open(filename, encoding='utf-8', mode='r') as handle:
        # Stream the file once, dropping comment lines as we go
        kept = [row for row in handle if not row.startswith(comment_prefix)]
    return kept
def parse_dict(lines: list) -> dict:
    """Parse CMUDict-formatted lines into a word -> pronunciations mapping.

    Two layouts are supported; the layout is detected from the first 10 lines
    and defaults to SSD unless a double space is found:

    Format 1, "DSD" (Double Space Delimited):
        - Comment lines start with ";;;"
        WORD  W ER1 D
    Format 2, "SSD" (Single Space Delimited):
        - Comment allowed at end of any line using "#"
        WORD W ER1 D # Comment

    Words are lowercased. An entry whose word carries a numeric suffix such as
    ``word(2)`` is treated as an alternate pronunciation: it is appended to the
    list stored under the base word and, when the base word is already
    present, also stored under its own key (``'word(2)'``). A repeated entry
    without a numeric suffix is ignored. Lines that are empty or lack the
    delimiter of the detected format are skipped.

    :param lines: Raw dictionary lines, e.g. as returned by ``read_dict``.
    :return: Dict mapping each word to a list of pronunciations, where each
        pronunciation is the value returned by ``ph.to_list`` for that entry.
    """
    parsed_dict = {}
    # Matches the "(N)" suffix that marks an alternate pronunciation
    variant_pattern = re.compile(r"\((\d+)\)")

    # Detect file format from the first 10 lines
    dict_form = 'SSD'
    for line in lines[:10]:
        line = line.strip()
        if line == '':
            continue
        if '  ' in line:
            dict_form = 'DSD'
            break

    # Iterate over the lines
    for line in lines:
        # Skip empty lines and lines with no space
        line = line.strip()
        if line == '' or ' ' not in line:
            continue

        # Split depending on format
        if dict_form == 'DSD':
            pairs = line.split('  ')
            if len(pairs) < 2:
                # No double-space delimiter on this line; nothing to parse
                continue
        else:
            head, _, tail = line.partition(' ')
            # Drop a trailing "#" comment from the phoneme part
            pairs = head, tail.split('#')[0]

        word = pairs[0].lower()  # Get word and lowercase it
        phonemes = [ph.to_list(pairs[1])]  # One pronunciation, wrapped in a list

        word_num = 0
        word_orig = None

        # Detect an alternate-pronunciation entry such as "word(2)"
        match = variant_pattern.search(word)
        if match is not None:
            # Keep the suffixed spelling, then strip the bracket to get the base word
            word_orig = word
            word = re.sub(r"\(.*\)", "", word)
            word_num = int(match.group(1))

        if word in parsed_dict:
            # A repeated entry with no variant number is ignored
            if word_num == 0:
                continue
            # Otherwise append this pronunciation to the base word
            parsed_dict[word].extend(phonemes)
            # Also store it under the original suffixed word
            if word_orig is not None:
                parsed_dict[word_orig] = phonemes
        else:
            # Create a new key
            parsed_dict[word] = phonemes

    # Return the dictionary
    return parsed_dict
class DictReader:
    """Load a CMUDict-formatted dictionary file into ``self.dict``.

    :param filename: Path of the dictionary file to read. When None, the
        default dictionary (``_dict_primary`` joined onto ``DATA_PATH``) is
        read instead.

    Attributes set by the constructor:
        filename: The ``filename`` argument exactly as passed (None for the default).
        dict: The mapping returned by ``parse_dict`` for the file that was read.
    """

    def __init__(self, filename=None):
        self.filename = filename
        if filename is not None:
            source = filename
        else:
            # NOTE(review): DATA_PATH is defined outside this file and is assumed
            # to be path-like (accepted by open()) - confirm. The path is passed
            # straight to read_dict rather than used in a `with` statement:
            # pathlib paths no longer support the context-manager protocol on
            # Python 3.13+, and entering one was a no-op before that.
            source = DATA_PATH.joinpath(_dict_primary)
        self.dict = parse_dict(read_dict(source))