Pendrokar committed
Commit 2080fde • 1 Parent(s): b2e5090

ionite34's h2p_parser and dep required for English

Files changed (29)
  1. requirements.txt +3 -0
  2. resources/app/python/xvapitch/text/h2p_parser/__init__.py +22 -0
  3. resources/app/python/xvapitch/text/h2p_parser/__main__.py +185 -0
  4. resources/app/python/xvapitch/text/h2p_parser/cmudictext.py +253 -0
  5. resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py +7 -0
  6. resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py +19 -0
  7. resources/app/python/xvapitch/text/h2p_parser/data/__init__.py +0 -0
  8. resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt +0 -0
  9. resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict +0 -0
  10. resources/app/python/xvapitch/text/h2p_parser/data/dict.json +1500 -0
  11. resources/app/python/xvapitch/text/h2p_parser/data/example.json +16 -0
  12. resources/app/python/xvapitch/text/h2p_parser/dict_reader.py +109 -0
  13. resources/app/python/xvapitch/text/h2p_parser/dictionary.py +85 -0
  14. resources/app/python/xvapitch/text/h2p_parser/filter.py +34 -0
  15. resources/app/python/xvapitch/text/h2p_parser/format_ph.py +99 -0
  16. resources/app/python/xvapitch/text/h2p_parser/h2p.py +123 -0
  17. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO +14 -0
  18. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt +19 -0
  19. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt +1 -0
  20. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt +2 -0
  21. resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt +1 -0
  22. resources/app/python/xvapitch/text/h2p_parser/pos_parser.py +17 -0
  23. resources/app/python/xvapitch/text/h2p_parser/processors.py +392 -0
  24. resources/app/python/xvapitch/text/h2p_parser/symbols.py +82 -0
  25. resources/app/python/xvapitch/text/h2p_parser/text/__init__.py +0 -0
  26. resources/app/python/xvapitch/text/h2p_parser/text/numbers.py +166 -0
  27. resources/app/python/xvapitch/text/h2p_parser/utils/__init__.py +0 -0
  28. resources/app/python/xvapitch/text/h2p_parser/utils/converter.py +79 -0
  29. resources/app/python/xvapitch/text/h2p_parser/utils/parser.py +133 -0
requirements.txt CHANGED
@@ -26,9 +26,11 @@ idna==2.10
 importlib-metadata==2.0.0
 importlib-resources==5.2.2
 inflect==4.1.0
+inquirerpy~=0.3.3
 jaconv==0.3
 joblib==0.17.0
 librosa
+nltk~=3.7
 num2words==0.5.10
 numpy
 omegaconf==2.1.1
@@ -43,6 +45,7 @@ pydub==0.25.1
 pykakasi==2.2.1
 pyparsing==2.4.7
 python-crfsuite==0.9.8
+pywordsegment~=0.2.1
 PyYAML
 regex==2021.8.28
 requests==2.25.1
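
Install note (not part of the diff): the three added packages back specific features in this commit. InquirerPy drives the interactive prompts in __main__.py, nltk supplies the WordNet lemmatizer and Snowball stemmer used by cmudictext.py, and pywordsegment powers its compound-word splitting. They can be installed with the same pins:

pip install "inquirerpy~=0.3.3" "nltk~=3.7" "pywordsegment~=0.2.1"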
resources/app/python/xvapitch/text/h2p_parser/__init__.py ADDED
@@ -0,0 +1,22 @@
+"""
+h2p_parser
+
+Heteronym to Phoneme Parser
+
+"""
+
+import sys
+
+if sys.version_info < (3, 9):
+    # In Python versions below 3.9, this is needed
+    from importlib_resources import files
+else:
+    # Since python 3.9+, importlib.resources.files is built-in
+    from importlib.resources import files
+
+__version__ = "1.0.0"
+
+# Data module
+DATA_PATH = files(__name__ + '.data')
+# Iterable collection of all files in data.
+DATA_FILES = DATA_PATH.iterdir()
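
A small sketch of using the data handles defined above (an assumption, not part of the commit): files() returns a Traversable, so the bundled dictionaries can be enumerated or read without hard-coding filesystem paths.

from h2p_parser import DATA_PATH, DATA_FILES

# List the bundled data files (e.g. dict.json, cmudict.dict).
for f in DATA_FILES:
    print(f.name)

# Read one resource as text.
text = (DATA_PATH / 'dict.json').read_text(encoding='utf-8')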
resources/app/python/xvapitch/text/h2p_parser/__main__.py ADDED
@@ -0,0 +1,185 @@
+from collections import Counter
+
+from InquirerPy import inquirer
+from InquirerPy.utils import patched_print, color_print
+from InquirerPy.base.control import Choice
+from InquirerPy.validator import PathValidator
+from h2p_parser.utils import converter
+from h2p_parser.utils import parser
+
+
+def convert_h2p(input_file, output_file, delimiter):
+    """
+    Converts a h2p dictionary file from one format to another.
+    """
+    converter.bin_delim_to_json(input_file, output_file, delimiter)
+    print('Converted h2p_dict to json.')
+
+
+def prompt_action() -> str:
+    action = inquirer.select(
+        message='Select action:',
+        choices=[
+            "Convert",
+            "Parse",
+            Choice(value=None, name='Exit')
+        ],
+        default=0,
+    ).execute()
+    if not action:
+        exit(0)
+    return action
+
+
+def prompt_f_input():
+    """
+    Prompts for input file.
+    """
+    return inquirer.filepath(
+        message='Select input file:',
+        validate=PathValidator(is_file=True, message='Input must be a file.')
+    ).execute()
+
+
+def prompt_f_output():
+    """
+    Prompts for output file.
+    """
+    return inquirer.filepath(
+        message='Select output file:',
+        validate=PathValidator(is_file=True, message='Output must be a file.')
+    ).execute()
+
+
+def action_convert():
+    """
+    Converts a h2p dictionary file from one format to another.
+    """
+    # Select input file
+    input_file = prompt_f_input()
+    if not input_file:
+        return
+
+    # Select output file
+    output_file = prompt_f_output()
+    if not output_file:
+        return
+
+    # Ask for delimiter
+    delimiter = inquirer.text(
+        message='Enter delimiter:',
+        default='|'
+    ).execute()
+    if not delimiter:
+        return
+
+    # Run Process
+    convert_h2p(input_file, output_file, delimiter)
+
+
+def action_parse_file():
+    """
+    Parses a metadata.csv file and checks for dictionary coverage
+    :return:
+    """
+    # Select input file
+    input_file = prompt_f_input()
+    if not input_file:
+        return
+
+    # Ask for delimiter
+    delimiter = inquirer.text(
+        message='Enter delimiter:',
+        default='|'
+    ).execute()
+    if not delimiter:
+        return
+
+    # Run Process
+    result = parser.check_lines(parser.read_file(input_file, delimiter))
+
+    # Print results
+    color_print([("#e5c07b", "Unresolved Words")])
+    color_print([("#d21205", "[All]: "),
+                 ("#ffffff", f"{len(result.unres_all_words)}/{len(result.all_words)}")])
+    color_print([("#7e3b41", "[Unique]: "),
+                 ("#ffffff", f"{len(result.unres_words)}/{len(result.words)}")])
+
+    color_print([("#4ce5c8", "-" * 10)])
+
+    color_print([("#e5c07b", "Unresolved Lines")])
+    color_print([("#d21205", "[All]: "),
+                 ("#ffffff", f"{len(result.unres_all_lines)}/{len(result.all_lines)}")])
+    color_print([("#7e3b41", "[Unique]: "),
+                 ("#ffffff", f"{len(result.unres_lines)}/{len(result.lines)}")])
+
+    color_print([("#4ce5c8", "-" * 10)])
+
+    color_print([("#e5c07b", "Expected Coverage")])
+    color_print([("#d21205", "[Lines]: "),
+                 ("#ffffff", f"{result.line_coverage()}%")])
+    color_print([("#7e3b41", "[Words]: "),
+                 ("#ffffff", f"{result.word_coverage()}%")])
+
+    color_print([("#4ce5c8", "-" * 10)])
+
+    color_print([("#e5c07b", "H2p parser")])
+    color_print([("#d21205", "[Lines with Heteronyms]: "),
+                 ("#ffffff", f"{len(result.all_lines_cont_het)}/{len(result.all_lines)}"
+                             f" | {result.percent_line_het()}%")])
+    color_print([("#7e3b41", "[Words Resolved by H2p]: "),
+                 ("#ffffff", f"{result.n_words_het}/{result.n_words_res}"
+                             f" | {result.percent_word_h2p()}%")])
+    # Calcs
+    feature_res = result.n_words_fet
+    feature_percent = round(feature_res / result.n_words_res * 100, 2)
+    cmu_res = result.n_words_cmu
+    cmu_percent = round(cmu_res / result.n_words_res * 100, 2)
+    color_print([("#c8bd20", "[Transformed Resolves]: "),
+                 ("#ffffff", f"{feature_res}/{result.n_words_res}"
+                             f" | {feature_percent}%")])
+    color_print([("#25a0c8", "[Words in CMUDict]: "),
+                 ("#ffffff", f"{cmu_res}/{result.n_words_res}"
+                             f" | {cmu_percent}%")])
+
+    color_print([("#4ce5c8", "-" * 10)])
+
+    color_print([("#e5c07b", "Feature Usage")])
+
+    # Loop through feature results
+    for ft in result.ft_stats:
+        color_print([("#d21205", f"{ft}: "),
+                     ("#ffffff", f"{result.ft_stats[ft]}/{result.n_words_res}"
+                                 f" | {round(result.ft_stats[ft] / result.n_words_res * 100, 2)}%")])
+
+    color_print([("#4ce5c8", "-" * 10)])
+
+    # Print 100 sampled unresolved words by frequency
+    color_print([("#e5c07b", "Top 100 most frequent unresolved words")])
+    # Count frequency of words
+    word_freq = Counter(result.unres_all_words)
+    # Sort by frequency
+    word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
+    # Print top 100
+    for word, freq in word_freq[:100]:
+        color_print([("#d21205", f"{word}: "),
+                     ("#ffffff", f"{freq}")])
+
+
+def entry():
+    """
+    Entry point. Prompts for an action and runs it.
+    """
+    # Select action type
+    action = prompt_action()
+    if action == 'Convert':
+        action_convert()
+    elif action == 'Parse':
+        action_parse_file()
+
+
+if __name__ == "__main__":
+    entry()
+
+
+
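
A quick run note (an assumption, not part of the commit): because the package ships this __main__.py, the interactive tool can be launched as a module once h2p_parser is importable, which calls entry() and prompts for Convert or Parse:

python -m h2p_parser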
resources/app/python/xvapitch/text/h2p_parser/cmudictext.py ADDED
@@ -0,0 +1,253 @@
+# Extended Grapheme to Phoneme conversion using CMU Dictionary and Heteronym parsing.
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+import pywordsegment
+import nltk
+from nltk.stem import WordNetLemmatizer
+from nltk.stem.snowball import SnowballStemmer
+from .h2p import H2p
+from .h2p import replace_first
+from . import format_ph as ph
+from .dict_reader import DictReader
+from .text.numbers import normalize_numbers
+from .filter import filter_text
+from .processors import Processor
+from copy import deepcopy
+
+re_digit = re.compile(r"\((\d+)\)")
+re_bracket_with_digit = re.compile(r"\(.*\)")
+
+# Check that the nltk data is downloaded, if not, download it
+try:
+    nltk.data.find('corpora/wordnet.zip')
+    nltk.data.find('corpora/omw-1.4.zip')
+except LookupError:
+    nltk.download('wordnet')
+    nltk.download('omw-1.4')
+
+
+class CMUDictExt:
+    def __init__(self, cmu_dict_path: str = None, h2p_dict_path: str = None, cmu_multi_mode: int = 0,
+                 process_numbers: bool = True, phoneme_brackets: bool = True, unresolved_mode: str = 'keep'):
+        # noinspection GrazieInspection
+        """
+        Initialize CMUDictExt - Extended Grapheme to Phoneme conversion using CMU Dictionary with Heteronym parsing.
+
+        CMU multi-entry resolution modes:
+        - -2 : Raw entry (i.e. 'A' resolves to 'AH0' and 'A(1)' to 'EY1')
+        - -1 : Skip resolving any entry with multiple pronunciations.
+        - 0 : Resolve using default un-numbered pronunciation.
+        - 1 : Resolve using (1) numbered pronunciation.
+        - n : Resolve using (n) numbered pronunciation.
+        - If a higher number is specified than available for the word, the highest available number is used.
+
+        Unresolved word resolution modes:
+        - keep : Keep the text-form word in the output.
+        - remove : Remove the text-form word from the output.
+        - drop : Return the line as None if any word is unresolved.
+
+        :param cmu_dict_path: Path to CMU dictionary file (.txt)
+        :type: str
+        :param h2p_dict_path: Path to Custom H2p dictionary (.json)
+        :type: str
+        :param cmu_multi_mode: CMU resolution mode for entries with multiple pronunciations.
+        :type: int
+        """
+
+        # Check valid unresolved_mode argument
+        if unresolved_mode not in ['keep', 'remove', 'drop']:
+            raise ValueError('Invalid value for unresolved_mode: {}'.format(unresolved_mode))
+        self.unresolved_mode = unresolved_mode
+
+        self.cmu_dict_path = cmu_dict_path  # Path to CMU dictionary file (.txt), if None, uses built-in
+        self.h2p_dict_path = h2p_dict_path  # Path to Custom H2p dictionary (.json), if None, uses built-in
+        self.cmu_multi_mode = cmu_multi_mode  # CMU multi-entry resolution mode
+        self.process_numbers = process_numbers  # Normalize numbers to text form, if enabled
+        self.phoneme_brackets = phoneme_brackets  # If True, phonemes are wrapped in curly brackets.
+        self.dict = DictReader(self.cmu_dict_path).dict  # CMU Dictionary
+        self.h2p = H2p(self.h2p_dict_path, preload=True)  # H2p parser
+        self.lemmatize = WordNetLemmatizer().lemmatize  # WordNet Lemmatizer - used to find singular form
+        self.stem = SnowballStemmer('english').stem  # Snowball Stemmer - used to find stem root of words
+        self.segment = pywordsegment.WordSegmenter().segment  # Word Segmenter
+        self.p = Processor(self)  # Processor for processing text
+
+        # Features
+        # Auto pluralization and de-pluralization
+        self.ft_auto_plural = True
+        # Auto splits and infers possessive forms of original words
+        self.ft_auto_pos = True
+        # Auto splits 'll
+        self.ft_auto_ll = True
+        # Auto splits and infers hyphenated words
+        self.ft_auto_hyphenated = True
+        # Auto splits possible compound words
+        self.ft_auto_compound = True
+        # Analyzes word root stem and infers pronunciation separately
+        # i.e. 'generously' -> 'generous' + 'ly'
+        self.ft_stem = True
+        # Forces compound words using manual lookup
+        self.ft_auto_compound_l2 = True
+
+    def lookup(self, text: str, pos: str = None, ph_format: str = 'sds') -> str | list | None:
+        # noinspection GrazieInspection
+        """
+        Gets the CMU Dictionary entry for a word.
+
+        Options for ph_format:
+
+        - 'sds' space delimited string
+        - 'sds_b' space delimited string with curly brackets
+        - 'list' list of phoneme strings
+
+        :param pos: Part of speech tag (Optional)
+        :param ph_format: Format of the phonemes to return
+        :type: str
+        :param text: Word to lookup
+        :type: str
+        """
+
+        def format_as(in_phoneme):
+            if ph_format == 'sds':
+                output = ph.to_sds(in_phoneme)
+            elif ph_format == 'sds_b':
+                output = ph.with_cb(ph.to_sds(in_phoneme))
+            elif ph_format == 'list':
+                output = ph.to_list(in_phoneme)
+            else:
+                raise ValueError('Invalid value for ph_format: {}'.format(ph_format))
+            return output
+
+        # Get the CMU Dictionary entry for the word
+        word = text.lower()
+        entry = deepcopy(self.dict.get(word))  # Ensure safe copy of entry
+
+        # Has entry, return it directly
+        if entry is not None:
+            return format_as(entry)
+
+        # Auto Possessive Processor
+        if self.ft_auto_pos:
+            res = self.p.auto_possessives(word)
+            if res is not None:
+                return format_as(res)
+
+        # Auto Contractions for "ll" or "d"
+        if self.ft_auto_ll:
+            res = self.p.auto_contractions(word)
+            if res is not None:
+                return format_as(res)
+
+        # Check for hyphenated words
+        if self.ft_auto_hyphenated:
+            res = self.p.auto_hyphenated(word)
+            if res is not None:
+                return format_as(res)
+
+        # Check for compound words
+        if self.ft_auto_compound:
+            res = self.p.auto_compound(word)
+            if res is not None:
+                return format_as(res)
+
+        # No entry, detect if this is a multi-word entry
+        if '(' in word and ')' in word and any(char.isdigit() for char in word):
+            # Parse the integer from the word using regex
+            num = int(re.findall(re_digit, word)[0])
+            # If found
+            if num is not None:
+                # Remove the integer and bracket from the word
+                actual_word = re.sub(re_bracket_with_digit, "", word)
+                # See if this is a valid entry
+                result = deepcopy(self.dict.get(actual_word))  # Ensure safe copy of entry
+                # If found:
+                if result is not None:
+                    # Translate the integer to index, clamped at 0
+                    index = max(num - 1, 0)
+                    # Check if index is less than the number of pronunciations
+                    if index < len(result):
+                        # Return the entry using the provided num index
+                        return format_as(result[index])
+                    # If entry is higher
+                    else:
+                        # Return the highest available entry
+                        return format_as(result[-1])
+
+        # Auto de-pluralization
+        # This is placed near the end because we need to do a pos-tag process
+        if self.ft_auto_plural:
+            res = self.p.auto_plural(word, pos)
+            if res is not None:
+                return format_as(res)
+
+        # Stem check
+        # noinspection SpellCheckingInspection
+        """
+        Supported modes for words ending in:
+        "ing", "ingly", "ly"
+        """
+        if self.ft_stem:
+            res = self.p.auto_stem(word)
+            if res is not None:
+                return format_as(res)
+
+        # Force compounding
+        if self.ft_auto_compound_l2:
+            res = self.p.auto_compound_l2(word)
+            if res is not None:
+                return format_as(res)
+
+        # If not found
+        return None
+
+    def convert(self, text: str) -> str | None:
+        # noinspection GrazieInspection
+        """
+        Replace a grapheme text line with phonemes.
+
+        :param text: Text line to be converted
+        :type: str
+        """
+
+        # Check valid unresolved_mode argument
+        if self.unresolved_mode not in ['keep', 'remove', 'drop']:
+            raise ValueError('Invalid value for unresolved_mode: {}'.format(self.unresolved_mode))
+        ur_mode = self.unresolved_mode
+
+        # Normalize numbers, if enabled
+        if self.process_numbers:
+            text = normalize_numbers(text)
+        # Filter and Tokenize
+        f_text = filter_text(text, preserve_case=True)
+        words = self.h2p.tokenize(f_text)
+        # Run POS tagging
+        tags = self.h2p.get_tags(words)
+
+        # Loop through words and pos tags
+        for word, pos in tags:
+            # Skip punctuation
+            if word == '.':
+                continue
+            # If word not in h2p dict, check CMU dict
+            if not self.h2p.dict.contains(word):
+                entry = self.lookup(word, pos)
+                if entry is None:
+                    if ur_mode == 'drop':
+                        return None
+                    if ur_mode == 'remove':
+                        text = replace_first(word, '', text)
+                    continue
+                # Do replace
+                f_ph = ph.with_cb(ph.to_sds(entry))
+                text = replace_first(word, f_ph, text)
+                continue
+            # For word in h2p dict, get phonemes
+            phonemes = self.h2p.dict.get_phoneme(word, pos)
+            # Format phonemes
+            f_ph = ph.with_cb(ph.to_sds(phonemes))
+            # Replace word with phonemes
+            text = replace_first(word, f_ph, text)
+        # Return text
+        return text
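
A minimal usage sketch for the class above (an assumption, not part of the commit; uses the built-in dictionaries):

from h2p_parser.cmudictext import CMUDictExt

cmu_ext = CMUDictExt(unresolved_mode='keep')
# Single-word lookup; the default 'sds' format is a space-delimited phoneme string.
print(cmu_ext.lookup('cat'))
# Line conversion; heteronyms such as 'read' are resolved via POS tags and
# resolved words are replaced with curly-bracketed phoneme strings.
print(cmu_ext.convert('I read the book yesterday.'))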
resources/app/python/xvapitch/text/h2p_parser/compat/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""
+Compatibility module.
+
+This module contains compatibility wrappers for existing
+implementations of CMUDict and other dictionaries.
+
+"""
resources/app/python/xvapitch/text/h2p_parser/compat/cmudict.py ADDED
@@ -0,0 +1,19 @@
+# Compatibility layer for using CMUDictExt with CMUDict-like API calls.
+# Designed to be compatible with the implementation of CMUDict in:
+# https://github.com/NVIDIA/DeepLearningExamples/
+#
+# Example usage:
+# from h2p_parser.compat.cmudict import CMUDict
+
+from h2p_parser.cmudictext import CMUDictExt
+
+
+class CMUDict(CMUDictExt):
+    def __init__(self, file_or_path=None, heteronyms_path=None, keep_ambiguous=True):
+        # Parameter Mapping:
+        # file_or_path => Mapped to cmu_dict_path
+        # heteronyms_path => Dropped as CMUDictExt uses H2p for heteronym parsing.
+        # keep_ambiguous => Mapped to cmu_multi_mode | True => -2, False => -1
+        super().__init__(file_or_path, heteronyms_path)
+        self._entries = {}
+        self.heteronyms = []
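
A drop-in sketch for the compatibility layer (an assumption, not part of the commit; mirrors the constructor shape referenced in the comments above):

from h2p_parser.compat.cmudict import CMUDict

# Accepts the NVIDIA-style arguments; heteronym handling is delegated to H2p.
cmudict = CMUDict(file_or_path=None, heteronyms_path=None, keep_ambiguous=True)
print(cmudict.lookup('present'))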
resources/app/python/xvapitch/text/h2p_parser/data/__init__.py ADDED
File without changes
resources/app/python/xvapitch/text/h2p_parser/data/cmudict-0.7b.txt ADDED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/cmudict.dict ADDED
The diff for this file is too large to render. See raw diff
 
resources/app/python/xvapitch/text/h2p_parser/data/dict.json ADDED
@@ -0,0 +1,1500 @@
+{
+  "absent": {
+    "DEFAULT": "AE1 B S AH0 N T",
+    "VERB": "AH1 B S AE1 N T"
+  },
+  "abstract": {
+    "DEFAULT": "AE1 B S T R AE2 K T",
+    "VERB": "AE0 B S T R AE1 K T"
+  },
+  "abstracts": {
+    "DEFAULT": "AE1 B S T R AE0 K T S",
+    "VERB": "AE0 B S T R AE1 K T S"
+  },
+  "abuse": {
+    "DEFAULT": "AH0 B Y UW1 S",
+    "VERB": "AH0 B Y UW1 Z"
+  },
+  "abuses": {
+    "DEFAULT": "AH0 B Y UW1 S IH0 Z",
+    "VERB": "AH0 B Y UW1 Z IH0 Z"
+  },
+  "accent": {
+    "DEFAULT": "AE1 K S EH2 N T",
+    "VERB": "AH0 K S EH1 N T"
+  },
+  "accents": {
+    "DEFAULT": "AE1 K S EH0 N T S",
+    "VERB": "AE1 K S EH0 N T S"
+  },
+  "addict": {
+    "DEFAULT": "AE1 D IH2 K T",
+    "VERB": "AH0 D IH1 K T"
+  },
+  "addicts": {
+    "DEFAULT": "AE1 D IH2 K T S",
+    "VERB": "AH0 D IH1 K T S"
+  },
+  "advocate": {
+    "DEFAULT": "AE1 D V AH0 K AH0 T",
+    "VERB": "AE1 D V AH0 K EY2 T"
+  },
+  "advocates": {
+    "DEFAULT": "AE1 D V AH0 K AH0 T S",
+    "VERB": "AE1 D V AH0 K EY2 T S"
+  },
+  "affect": {
+    "DEFAULT": "AE1 F EH0 K T",
+    "VERB": "AH0 F EH1 K T"
+  },
+  "affects": {
+    "DEFAULT": "AE1 F EH0 K T S",
+    "VERB": "AH0 F EH1 K T S"
+  },
+  "affix": {
+    "DEFAULT": "AE1 F IH0 K S",
+    "VERB": "AH0 F IH1 K S"
+  },
+  "affixes": {
+    "DEFAULT": "AE1 F IH0 K S IH0 Z",
+    "VERB": "AH0 F IH1 K S IH0 Z"
+  },
+  "agglomerate": {
+    "DEFAULT": "AH0 G L AA1 M ER0 AH0 T",
+    "VERB": "AH0 G L AA1 M ER0 EY2 T"
+  },
+  "aggregate": {
+    "DEFAULT": "AE1 G R AH0 G AH0 T",
+    "VERB": "AE1 G R AH0 G EY0 T"
+  },
+  "aggregates": {
+    "DEFAULT": "AE1 G R AH0 G IH0 T S",
+    "VERB": "AE1 G R AH0 G EY2 T S"
+  },
+  "allies": {
+    "DEFAULT": "AE1 L AY0 Z",
+    "VERB": "AH0 L AY1 Z"
+  },
+  "alloy": {
+    "DEFAULT": "AE1 L OY2",
+    "VERB": "AH0 L OY1"
+  },
+  "alloys": {
+    "DEFAULT": "AE1 L OY2 Z",
+    "VERB": "AH0 L OY1 Z"
+  },
+  "ally": {
+    "DEFAULT": "AE1 L AY0",
+    "VERB": "AH0 L AY1"
+  },
+  "alternate": {
+    "DEFAULT": "AO0 L T ER1 N AH0 T",
+    "VERB": "AO1 L T ER0 N EY2 T"
+  },
+  "analyses": {
+    "DEFAULT": "AE1 N AH0 L AY0 Z IH2 Z",
+    "VERB": "AH0 N AE1 L IH0 S IY2 Z"
+  },
+  "animate": {
+    "DEFAULT": "AE1 N AH0 M AH0 T",
+    "VERB": "AE1 N AH0 M EY2 T"
+  },
+  "annex": {
+    "DEFAULT": "AE1 N EH2 K S",
+    "VERB": "AH0 N EH1 K S"
+  },
+  "annexes": {
+    "DEFAULT": "AE1 N EH2 K S IH0 Z",
+    "VERB": "AH0 N EH1 K S IH0 Z"
+  },
+  "appropriate": {
+    "DEFAULT": "AH0 P R OW1 P R IY0 AH0 T",
+    "VERB": "AH0 P R OW1 P R IY0 EY2 T"
+  },
+  "approximate": {
+    "DEFAULT": "AH0 P R AA1 K S AH0 M AH0 T",
+    "VERB": "AH0 P R AA1 K S AH0 M EY2 T"
+  },
+  "articulate": {
+    "DEFAULT": "AA0 R T IH1 K Y AH0 L EY2 T",
+    "VERB": "AA0 R T IH1 K Y AH0 L AH0 T"
+  },
+  "aspirate": {
+    "DEFAULT": "AE1 S P ER0 AH0 T",
+    "VERB": "AE1 S P ER0 EY2 T"
+  },
+  "aspirates": {
+    "DEFAULT": "AE1 S P ER0 AH0 T S",
+    "VERB": "AE1 S P ER0 EY2 T S"
+  },
+  "associate": {
+    "DEFAULT": "AH0 S OW1 S IY0 AH0 T",
+    "VERB": "AH0 S OW1 S IY0 EY2 T"
+  },
+  "associates": {
+    "DEFAULT": "AH0 S OW1 S IY0 AH0 T S",
+    "VERB": "AH0 S OW1 S IY0 EY2 T S"
+  },
+  "attribute": {
+    "DEFAULT": "AE1 T R IH0 B Y UW0 T",
+    "VERB": "AH0 T R IH1 B Y UW2 T"
+  },
+  "attributes": {
+    "DEFAULT": "AE1 T R IH0 B Y UW0 T S",
+    "VERB": "AH0 T R IH1 B Y UW2 T S"
+  },
+  "baths": {
+    "DEFAULT": "B AE1 DH Z",
+    "VERB": "B AE1 TH S"
+  },
+  "blessed": {
+    "DEFAULT": "B L EH1 S T",
+    "VERB": "B L EH1 S IH0 D"
+  },
+  "certificate": {
+    "DEFAULT": "S ER0 T IH1 F IH0 K EY2 T",
+    "VERB": "S ER0 T IH1 F IH0 K AH0 T"
+  },
+  "certificates": {
+    "DEFAULT": "S ER0 T IH1 F IH0 K AH0 T S",
+    "VERB": "S ER0 T IH1 F IH0 K EY2 T S"
+  },
+  "close": {
+    "DEFAULT": "K L OW1 S",
+    "VERB": "K L OW1 Z"
+  },
+  "closer": {
+    "DEFAULT": "K L OW1 S ER0",
+    "NOUN": "K L OW1 Z ER0"
+  },
+  "closes": {
+    "DEFAULT": "K L OW1 S IH0 Z",
+    "VERB": "K L OW1 Z IH0 Z"
+  },
+  "collect": {
+    "DEFAULT": "K AA1 L EH0 K T",
+    "VERB": "K AH0 L EH1 K T"
+  },
+  "collects": {
+    "DEFAULT": "K AA1 L EH0 K T S",
+    "VERB": "K AH0 L EH1 K T S"
+  },
+  "combat": {
+    "DEFAULT": "K AA1 M B AE0 T",
+    "VERB": "K AH0 M B AE1 T"
+  },
+  "combats": {
+    "DEFAULT": "K AH1 M B AE0 T S",
+    "VERB": "K AH0 M B AE1 T S"
+  },
+  "combine": {
+    "DEFAULT": "K AA1 M B AY0 N",
+    "VERB": "K AH0 M B AY1 N"
+  },
+  "commune": {
+    "DEFAULT": "K AA1 M Y UW0 N",
+    "VERB": "K AH0 M Y UW1 N"
+  },
+  "communes": {
+    "DEFAULT": "K AA1 M Y UW0 N Z",
+    "VERB": "K AH0 M Y UW1 N Z"
+  },
+  "compact": {
+    "DEFAULT": "K AA1 M P AE0 K T",
+    "VERB": "K AH0 M P AE1 K T"
+  },
+  "compacts": {
+    "DEFAULT": "K AA1 M P AE0 K T S",
+    "VERB": "K AH0 M P AE1 K T S"
+  },
+  "complex": {
+    "ADJ": "K AH0 M P L EH1 K S",
+    "DEFAULT": " K AA1 M P L EH0 K S"
+  },
+  "compliment": {
+    "DEFAULT": "K AA1 M P L AH0 M AH0 N T",
+    "VERB": "K AA1 M P L AH0 M EH0 N T"
+  },
+  "compliments": {
+    "DEFAULT": "K AA1 M P L AH0 M AH0 N T S",
+    "VERB": "K AA1 M P L AH0 M EH0 N T S"
+  },
+  "compound": {
+    "DEFAULT": "K AA1 M P AW0 N D",
+    "VERB": "K AH0 M P AW1 N D"
+  },
+  "compounds": {
+    "DEFAULT": "K AA1 M P AW0 N D Z",
+    "VERB": "K AH0 M P AW1 N D Z"
+  },
+  "compress": {
+    "DEFAULT": "K AA1 M P R EH0 S",
+    "VERB": "K AH0 M P R EH1 S"
+  },
+  "compresses": {
+    "DEFAULT": "K AA1 M P R EH0 S AH0 Z",
+    "VERB": "K AH0 M P R EH1 S IH0 Z"
+  },
+  "concert": {
+    "DEFAULT": "K AA1 N S ER0 T",
+    "VERB": "K AH0 N S ER1 T"
+  },
+  "concerts": {
+    "DEFAULT": "K AA1 N S ER0 T S",
+    "VERB": "K AH0 N S ER1 T S"
+  },
+  "conduct": {
+    "DEFAULT": "K AA1 N D AH0 K T",
+    "VERB": "K AA0 N D AH1 K T"
+  },
+  "confederate": {
+    "DEFAULT": "K AH0 N F EH1 D ER0 AH0 T",
+    "VERB": "K AH0 N F EH1 D ER0 EY2 T"
+  },
+  "confederates": {
+    "DEFAULT": "K AH0 N F EH1 D ER0 AH0 T S",
+    "VERB": "K AH0 N F EH1 D ER0 EY2 T S"
+  },
+  "confines": {
+    "DEFAULT": "K AA1 N F AY2 N Z",
+    "VERB": "K AH0 N F AY1 N Z"
+  },
+  "conflict": {
+    "DEFAULT": "K AA1 N F L IH0 K T",
+    "VERB": "K AH0 N F L IH1 K T"
+  },
+  "conflicts": {
+    "DEFAULT": "K AA1 N F L IH0 K T S",
+    "VERB": "K AH0 N F L IH1 K T S"
+  },
+  "conglomerate": {
+    "DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T",
+    "VERB": "K AH0 N G L AA1 M ER0 EY2 T"
+  },
+  "conglomerates": {
+    "DEFAULT": "K AH0 N G L AA1 M ER0 AH0 T S",
+    "VERB": "K AH0 N G L AA1 M ER0 EY2 T S"
+  },
+  "conscript": {
+    "DEFAULT": "K AA1 N S K R IH0 P T",
+    "VERB": "K AH0 N S K R IH1 P T"
+  },
+  "conscripts": {
+    "DEFAULT": "K AA1 N S K R IH0 P T S",
+    "VERB": "K AH0 N S K R IH1 P T S"
+  },
+  "console": {
+    "DEFAULT": "K AA1 N S OW0 L",
+    "VERB": "K AH0 N S OW1 L"
+  },
+  "consoles": {
+    "DEFAULT": "K AA1 N S OW0 L Z",
+    "VERB": "K AH0 N S OW1 L Z"
+  },
+  "consort": {
+    "DEFAULT": "K AA1 N S AO0 R T",
+    "VERB": "K AH0 N S AO1 R T"
+  },
+  "construct": {
+    "DEFAULT": "K AA1 N S T R AH0 K T",
+    "VERB": "K AH0 N S T R AH1 K T"
+  },
+  "constructs": {
+    "DEFAULT": "K AA1 N S T R AH0 K T S",
+    "VERB": "K AH0 N S T R AH1 K T S"
+  },
+  "consummate": {
+    "DEFAULT": "K AA0 N S AH1 M AH0 T",
+    "VERB": "K AA1 N S AH0 M EY2 T"
+  },
+  "content": {
+    "DEFAULT": "K AH0 N T EH1 N T",
+    "NOUN": "K AA1 N T EH0 N T"
+  },
+  "contents": {
+    "DEFAULT": "K AA1 N T EH0 N T S",
+    "VERB": "K AH0 N T EH1 N T S"
+  },
+  "contest": {
+    "DEFAULT": "K AA1 N T EH0 S T",
+    "VERB": "K AH0 N T EH1 S T"
+  },
+  "contests": {
+    "DEFAULT": "K AA1 N T EH0 S T S",
+    "VERB": "K AH0 N T EH1 S T S"
+  },
+  "contract": {
+    "DEFAULT": "K AA1 N T R AE2 K T",
+    "VERB": "K AH0 N T R AE1 K T"
+  },
+  "contracts": {
+    "DEFAULT": "K AA1 N T R AE2 K T S",
+    "VERB": "K AH0 N T R AE1 K T S"
+  },
+  "contrast": {
+    "DEFAULT": "K AA1 N T R AE0 S T",
+    "VERB": "K AH0 N T R AE1 S T"
+  },
+  "contrasts": {
+    "DEFAULT": "K AA1 N T R AE0 S T S",
+    "VERB": "K AH0 N T R AE1 S T S"
+  },
+  "converse": {
+    "DEFAULT": "K AA1 N V ER0 S",
+    "VERB": "K AH0 N V ER1 S"
+  },
+  "convert": {
+    "DEFAULT": "K AA1 N V ER0 T",
+    "VERB": "K AH0 N V ER1 T"
+  },
+  "converts": {
+    "DEFAULT": "K AA1 N V ER0 T S",
+    "VERB": "K AH0 N V ER1 T S"
+  },
+  "convict": {
+    "DEFAULT": "K AA1 N V IH0 K T",
+    "VERB": "K AH0 N V IH1 K T"
+  },
+  "convicts": {
+    "DEFAULT": "K AA1 N V IH0 K T S",
+    "VERB": "K AH0 N V IH1 K T S"
+  },
+  "coordinate": {
+    "DEFAULT": "K OW0 AO1 R D AH0 N AH0 T",
+    "VERB": "K OW0 AO1 R D AH0 N EY2 T"
+  },
+  "coordinates": {
+    "DEFAULT": "K OW0 AO1 R D AH0 N AH0 T S",
+    "VERB": "K OW0 AO1 R D AH0 N EY2 T S"
+  },
+  "counterbalance": {
+    "DEFAULT": "K AW2 N T ER0 B AE1 L AH0 N S",
+    "VERB": "K AW1 N T ER0 B AE2 L AH0 N S"
+  },
+  "counterbalances": {
+    "DEFAULT": "K AW1 N T ER0 B AE2 L AH0 N S IH0 Z",
+    "VERB": "K AW2 N T ER0 B AE1 L AH0 N S IH0 Z"
+  },
+  "crabbed": {
+    "DEFAULT": "K R AE1 B IH0 D",
+    "VERB": "K R AE1 B D"
+  },
+  "crooked": {
+    "DEFAULT": "K R UH1 K AH0 D",
+    "VERB": "K R UH1 K T"
+  },
+  "curate": {
+    "DEFAULT": "K Y UH1 R AH0 T",
+    "VERB": "K Y UH0 R AH1 T"
+  },
+  "cursed": {
+    "DEFAULT": "K ER1 S IH0 D",
+    "VERB": "K ER1 S T"
+  },
+  "decoy": {
+    "DEFAULT": "D IY1 K OY0",
+    "VERB": "D IY0 K OY1"
+  },
+  "decoys": {
+    "DEFAULT": "D IY1 K OY0 Z",
+    "VERB": "D IY0 K OY1 Z"
+  },
+  "decrease": {
+    "DEFAULT": "D IY1 K R IY2 S",
+    "VERB": "D IH0 K R IY1 S"
+  },
+  "decreases": {
+    "DEFAULT": "D IY1 K R IY2 S IH0 Z",
+    "VERB": "D IH0 K R IY1 S IH0 Z"
+  },
+  "defect": {
+    "DEFAULT": "D IY1 F EH0 K T",
+    "VERB": "D IH0 F EH1 K T"
+  },
+  "defects": {
+    "DEFAULT": "D IY1 F EH0 K T S",
+    "VERB": "D IH0 F EH1 K T S"
+  },
+  "degenerate": {
+    "DEFAULT": "D IH0 JH EH1 N ER0 AH0 T",
+    "VERB": "D IH0 JH EH1 N ER0 EY2 T"
+  },
+  "degenerates": {
+    "DEFAULT": "D IH0 JH EH1 N ER0 AH0 T S",
+    "VERB": "D IH0 JH EH1 N ER0 EY2 T S"
+  },
+  "delegate": {
+    "DEFAULT": "D EH1 L AH0 G AH0 T",
+    "VERB": "D EH1 L AH0 G EY2 T"
+  },
+  "delegates": {
+    "DEFAULT": "D EH1 L AH0 G AH0 T S",
+    "VERB": "D EH1 L AH0 G EY2 T S"
+  },
+  "deliberate": {
+    "DEFAULT": "D IH0 L IH1 B ER0 AH0 T",
+    "VERB": "D IH0 L IH1 B ER0 EY2 T"
+  },
+  "desert": {
+    "DEFAULT": "D EH1 Z ER0 T",
+    "VERB": "D IH0 Z ER1 T"
+  },
+  "deserts": {
+    "DEFAULT": "D EH1 Z ER0 T S",
+    "VERB": "D IH0 Z ER1 T S"
+  },
+  "desolate": {
+    "DEFAULT": "D EH1 S AH0 L AH0 T",
+    "VERB": "D EH1 S AH0 L EY2 T"
+  },
+  "diagnoses": {
+    "DEFAULT": "D AY2 AH0 G N OW1 S IY0 Z",
+    "VERB": "D AY1 AH0 G N OW2 Z IY0 Z"
+  },
+  "dictate": {
+    "DEFAULT": "D IH1 K T EY2 T",
+    "VERB": "D IH0 K T EY1 T"
+  },
+  "dictates": {
+    "DEFAULT": "D IH1 K T EY2 T S",
+    "VERB": "D IH0 K T EY1 T S"
+  },
+  "diffuse": {
+    "DEFAULT": "D IH0 F Y UW1 S",
+    "VERB": "D IH0 F Y UW1 Z"
+  },
+  "digest": {
+    "DEFAULT": "D AY1 JH EH0 S T",
+    "VERB": "D AY0 JH EH1 S T"
+  },
+  "digests": {
+    "DEFAULT": "D AY1 JH EH0 S T S",
+    "VERB": "D AY2 JH EH1 S T S"
+  },
+  "discard": {
+    "DEFAULT": "D IH1 S K AA0 R D",
+    "VERB": "D IH0 S K AA1 R D"
+  },
+  "discards": {
+    "DEFAULT": "D IH1 S K AA0 R D Z",
+    "VERB": "D IH0 S K AA1 R D Z"
+  },
+  "discharge": {
+    "DEFAULT": "D IH1 S CH AA2 R JH",
+    "VERB": "D IH0 S CH AA1 R JH"
+  },
+  "discharges": {
+    "DEFAULT": "D IH1 S CH AA2 R JH AH0 Z",
+    "VERB": "D IH0 S CH AA1 R JH AH0 Z"
+  },
+  "discount": {
+    "DEFAULT": "D IH1 S K AW0 N T",
+    "VERB": "D IH0 S K AW1 N T"
+  },
+  "discounts": {
+    "DEFAULT": "D IH1 S K AW2 N T S",
+    "VERB": "D IH0 S K AW1 N T S"
+  },
+  "discourse": {
+    "DEFAULT": "D IH1 S K AO0 R S",
+    "VERB": "D IH0 S K AO1 R S"
+  },
+  "discourses": {
+    "DEFAULT": "D IH1 S K AO0 R S IH0 Z",
+    "VERB": "D IH0 S K AO1 R S IH0 Z"
+  },
+  "document": {
+    "DEFAULT": "D AA1 K Y AH0 M AH0 N T",
+    "VERB": "D AA1 K Y UW0 M EH0 N T"
+  },
+  "documents": {
+    "DEFAULT": "D AA1 K Y AH0 M AH0 N T S",
+    "VERB": "D AA1 K Y UW0 M EH0 N T S"
+  },
+  "dogged": {
+    "DEFAULT": "D AO1 G D",
+    "VERB": "D AO1 G IH0 D"
+  },
+  "duplicate": {
+    "DEFAULT": "D UW1 P L AH0 K AH0 T",
+    "VERB": "D UW1 P L AH0 K EY2 T"
+  },
+  "duplicates": {
+    "DEFAULT": "D UW1 P L AH0 K AH0 T S",
+    "VERB": "D UW1 P L AH0 K EY2 T S"
+  },
+  "ejaculate": {
+    "DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T",
+    "VERB": "IH0 JH AE1 K Y UW0 L EY2 T"
+  },
+  "ejaculates": {
+    "DEFAULT": "IH0 JH AE1 K Y UW0 L AH0 T S",
+    "VERB": "IH0 JH AE1 K Y UW0 L EY2 T S"
+  },
+  "elaborate": {
+    "DEFAULT": "IH0 L AE1 B R AH0 T",
+    "VERB": "IH0 L AE1 B ER0 EY2 T"
+  },
+  "entrance": {
+    "DEFAULT": "EH1 N T R AH0 N S",
+    "VERB": "IH0 N T R AH1 N S"
+  },
+  "entrances": {
+    "DEFAULT": "EH1 N T R AH0 N S AH0 Z",
+    "VERB": "IH0 N T R AH1 N S AH0 Z"
+  },
+  "envelope": {
+    "DEFAULT": "EH1 N V AH0 L OW2 P",
+    "VERB": "IH0 N V EH1 L AH0 P"
+  },
+  "envelopes": {
+    "DEFAULT": "EH1 N V AH0 L OW2 P S",
+    "VERB": "IH0 N V EH1 L AH0 P S"
+  },
+  "escort": {
+    "DEFAULT": "EH1 S K AO0 R T",
+    "VERB": "EH0 S K AO1 R T"
+  },
+  "escorts": {
+    "DEFAULT": "EH1 S K AO0 R T S",
+    "VERB": "EH0 S K AO1 R T S"
+  },
+  "essay": {
+    "DEFAULT": "EH1 S EY2",
+    "VERB": "EH0 S EY1"
+  },
+  "essays": {
+    "DEFAULT": "EH1 S EY2 Z",
+    "VERB": "EH0 S EY1 Z"
+  },
+  "estimate": {
+    "DEFAULT": "EH1 S T AH0 M AH0 T",
+    "VERB": "EH1 S T AH0 M EY2 T"
+  },
+  "estimates": {
+    "DEFAULT": "EH1 S T AH0 M AH0 T S",
+    "VERB": "EH1 S T AH0 M EY2 T S"
+  },
+  "excess": {
+    "DEFAULT": "EH1 K S EH2 S",
+    "VERB": "IH0 K S EH1 S"
+  },
+  "excise": {
+    "DEFAULT": "EH1 K S AY0 Z",
+    "VERB": "EH0 K S AY1 S"
+  },
+  "excuse": {
+    "DEFAULT": "IH0 K S K Y UW1 S",
+    "VERB": "IH0 K S K Y UW1 Z"
+  },
+  "excuses": {
+    "DEFAULT": "IH0 K S K Y UW1 S IH0 Z",
+    "VERB": "IH0 K S K Y UW1 Z IH0 Z"
+  },
+  "expatriate": {
+    "DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T",
+    "VERB": "EH0 K S P EY1 T R IY0 EY2 T"
+  },
+  "expatriates": {
+    "DEFAULT": "EH0 K S P EY1 T R IY0 AH0 T S",
+    "VERB": "EH0 K S P EY1 T R IY0 EY2 T S"
+  },
+  "exploit": {
+    "DEFAULT": "EH2 K S P L OY1 T",
+    "VERB": "EH1 K S P L OY2 T"
+  },
+  "exploits": {
+    "DEFAULT": "EH2 K S P L OY1 T S",
+    "VERB": "EH1 K S P L OY2 T S"
+  },
+  "export": {
+    "DEFAULT": "EH1 K S P AO0 R T",
+    "VERB": "IH0 K S P AO1 R T"
+  },
+  "exports": {
+    "DEFAULT": "EH1 K S P AO0 R T S",
+    "VERB": "IH0 K S P AO1 R T S"
+  },
+  "extract": {
+    "DEFAULT": "EH1 K S T R AE2 K T",
+    "VERB": "IH0 K S T R AE1 K T"
+  },
+  "extracts": {
+    "DEFAULT": "EH1 K S T R AE2 K T S",
+    "VERB": "IH0 K S T R AE1 K T S"
+  },
+  "ferment": {
+    "DEFAULT": "F ER1 M EH0 N T",
+    "VERB": "F ER0 M EH1 N T"
+  },
+  "ferments": {
+    "DEFAULT": "F ER1 M EH0 N T S",
+    "VERB": "F ER0 M EH1 N T S"
+  },
+  "fragment": {
+    "DEFAULT": "F R AE0 G M EH1 N T",
+    "VERB": "F R AE1 G M AH0 N T"
+  },
+  "fragments": {
+    "DEFAULT": "F R AE1 G M AH0 N T S",
+    "VERB": "F R AE0 G M EH1 N T S"
+  },
+  "frequent": {
+    "DEFAULT": "F R IY1 K W AH0 N T",
+    "VERB": "F R IY1 K W EH2 N T"
+  },
+  "graduate": {
+    "DEFAULT": "G R AE1 JH AH0 W AH0 T",
+    "VERB": "G R AE1 JH AH0 W EY2 T"
+  },
+  "graduates": {
+    "DEFAULT": "G R AE1 JH AH0 W AH0 T S",
+    "VERB": "G R AE1 JH AH0 W EY2 T S"
+  },
+  "house": {
+    "DEFAULT": "HH AW1 S",
+    "VERB": "HH AW1 Z"
+  },
+  "impact": {
+    "DEFAULT": "IH1 M P AE0 K T",
+    "VERB": "IH2 M P AE1 K T"
+  },
+  "impacts": {
+    "DEFAULT": "IH1 M P AE0 K T S",
+    "VERB": "IH2 M P AE1 K T S"
+  },
+  "implant": {
+    "DEFAULT": "IH1 M P L AE2 N T",
+    "VERB": "IH2 M P L AE1 N T"
+  },
+  "implants": {
+    "DEFAULT": "IH1 M P L AE2 N T S",
+    "VERB": "IH2 M P L AE1 N T S"
+  },
+  "implement": {
+    "DEFAULT": "IH1 M P L AH0 M AH0 N T",
+    "VERB": "IH1 M P L AH0 M EH0 N T"
+  },
+  "implements": {
+    "DEFAULT": "IH1 M P L AH0 M AH0 N T S",
+    "VERB": "IH1 M P L AH0 M EH0 N T S"
+  },
+  "import": {
+    "DEFAULT": "IH1 M P AO2 R T",
+    "VERB": "IH2 M P AO1 R T"
+  },
+  "imports": {
+    "DEFAULT": "IH1 M P AO2 R T S",
+    "VERB": "IH2 M P AO1 R T S"
+  },
+  "impress": {
+    "DEFAULT": "IH1 M P R EH0 S",
+    "VERB": "IH0 M P R EH1 S"
+  },
+  "imprint": {
+    "DEFAULT": "IH2 M P R IH1 N T",
+    "VERB": "IH1 M P R IH0 N T"
+  },
+  "imprints": {
+    "DEFAULT": "IH1 M P R IH0 N T S",
+    "VERB": "IH2 M P R IH1 N T S"
+  },
+  "incense": {
+    "DEFAULT": "IH1 N S EH2 N S",
+    "VERB": "IH2 N S EH1 N S"
+  },
+  "incline": {
+    "DEFAULT": "IH1 N K L AY0 N",
+    "VERB": "IH2 N K L AY1 N"
+  },
+  "inclines": {
+    "DEFAULT": "IH1 N K L AY0 N Z",
+    "VERB": "IH2 N K L AY1 N Z"
+  },
+  "incorporate": {
+    "DEFAULT": "IH2 N K AO1 R P ER0 AH0 T",
+    "VERB": "IH2 N K AO1 R P ER0 EY2 T"
+  },
+  "increase": {
+    "DEFAULT": "IH1 N K R IY2 S",
+    "VERB": "IH2 N K R IY1 S"
+  },
+  "increases": {
+    "DEFAULT": "IH1 N K R IY2 S IH0 Z",
+    "VERB": "IH2 N K R IY1 S IH0 Z"
+  },
+  "indent": {
+    "DEFAULT": "IH1 N D EH0 N T",
+    "VERB": "IH2 N D EH1 N T"
+  },
+  "indents": {
+    "DEFAULT": "IH1 N D EH0 N T S",
+    "VERB": "IH2 N D EH1 N T S"
+  },
+  "inebriate": {
+    "DEFAULT": "IH2 N EH1 B R IY0 AH0 T",
+    "VERB": "IH2 N EH1 B R IY0 EY2 T"
+  },
+  "inebriates": {
+    "DEFAULT": "IH2 N EH1 B R IY0 AH0 T S",
+    "VERB": "IH2 N EH1 B R IY0 EY2 T S"
+  },
+  "initiate": {
+    "DEFAULT": "IH2 N IH1 SH IY0 AH0 T",
+    "VERB": "IH2 N IH1 SH IY0 EY2 T"
+  },
+  "initiates": {
+    "DEFAULT": "IH2 N IH1 SH IY0 AH0 T S",
+    "VERB": "IH2 N IH1 SH IY0 EY2 T S"
+  },
+  "inlay": {
+    "DEFAULT": "IH1 N L EY2",
+    "VERB": "IH2 N L EY1"
+  },
+  "inlays": {
+    "DEFAULT": "IH1 N L EY2 Z",
+    "VERB": "IH2 N L EY1 Z"
+  },
+  "insert": {
+    "DEFAULT": "IH1 N S ER2 T",
+    "VERB": "IH2 N S ER1 T"
+  },
+  "inserts": {
+    "DEFAULT": "IH1 N S ER2 T S",
+    "VERB": "IH2 N S ER1 T S"
+  },
+  "inset": {
+    "DEFAULT": "IH1 N S EH2 T",
+    "VERB": "IH2 N S EH1 T"
+  },
+  "insets": {
+    "DEFAULT": "IH1 N S EH2 T S",
+    "VERB": "IH2 N S EH1 T S"
+  },
+  "instinct": {
+    "DEFAULT": "IH1 N S T IH0 NG K T",
+    "VERB": "IH2 N S T IH1 NG K T"
+  },
+  "insult": {
+    "DEFAULT": "IH1 N S AH2 L T",
+    "VERB": "IH2 N S AH1 L T"
+  },
+  "insults": {
+    "DEFAULT": "IH1 N S AH2 L T S",
+    "VERB": "IH2 N S AH1 L T S"
+  },
+  "interchange": {
+    "DEFAULT": "IH1 N T ER0 CH EY2 N JH",
+    "VERB": "IH2 T ER0 CH EY1 N JH"
+  },
+  "interchanges": {
+    "DEFAULT": "IH1 N T ER0 CH EY2 N JH IH0 Z",
+    "VERB": "IH2 T ER0 CH EY1 N JH IH0 Z"
+  },
+  "interdict": {
+    "DEFAULT": "IH1 N T ER0 D IH2 K T",
+    "VERB": "IH2 N T ER0 D IH1 K T"
+  },
+  "interdicts": {
+    "DEFAULT": "IH1 N T ER0 D IH2 K T S",
+    "VERB": "IH2 N T ER0 D IH1 K T S"
+  },
+  "intern": {
+    "DEFAULT": "IH1 N T ER0 N",
+    "VERB": "IH0 N T ER1 N"
+  },
+  "interns": {
+    "DEFAULT": "IH1 N T ER0 N Z",
+    "VERB": "IH0 N T ER1 N Z"
+  },
+  "intimate": {
+    "DEFAULT": "IH1 N T AH0 M AH0 T",
+    "VERB": "IH1 N T IH0 M EY2 T"
+  },
+  "intimates": {
+    "DEFAULT": "IH1 N T AH0 M AH0 T S",
+    "VERB": "IH1 N T IH0 M EY2 T S"
+  },
+  "intrigue": {
+    "DEFAULT": "IH1 N T R IY0 G",
+    "VERB": "IH2 N T R IY1 G"
+  },
+  "introvert": {
+    "DEFAULT": "IH1 N T R AO0 V ER2 T",
+    "VERB": "IH2 N T R AO0 V ER1 T"
+  },
+  "introverts": {
+    "DEFAULT": "IH1 N T R AO0 V ER2 T S",
+    "VERB": "IH2 N T R AO0 V ER1 T S"
+  },
+  "inverse": {
+    "DEFAULT": "IH2 N V ER1 S",
+    "VERB": "IH1 N V ER0 S"
+  },
+  "invite": {
+    "DEFAULT": "IH1 N V AY0 T",
+    "VERB": "IH2 N V AY1 T"
+  },
+  "invites": {
+    "DEFAULT": "IH1 N V AY0 T S",
+    "VERB": "IH2 N V AY1 T S"
+  },
+  "jagged": {
+    "DEFAULT": "JH AE1 G IH0 D",
+    "VERB": "JH AE1 G D"
+  },
+  "learned": {
+    "DEFAULT": "L ER1 N D",
+    "VERB": "L ER1 N IH0 D"
+  },
+  "legitimate": {
+    "DEFAULT": "L AH0 JH IH1 T AH0 M AH0 T",
+    "VERB": "L AH0 JH IH1 T AH0 M EY2 T"
+  },
+  "live": {
+    "DEFAULT": "L AY1 V",
+    "VERB": "L IH1 V"
+  },
+  "lives": {
+    "DEFAULT": "L AY1 V Z",
+    "VERB": "L IH1 V Z"
+  },
+  "mandate": {
+    "DEFAULT": "M AE2 N D EY1 T",
+    "VERB": "M AE1 N D EY2 T"
+  },
+  "misconduct": {
+    "DEFAULT": "M IH2 S K AA0 N D AH1 K T",
+    "VERB": "M IH2 S K AA1 N D AH0 K T"
+  },
+  "misprint": {
+    "DEFAULT": "M IH1 S P R IH0 N T",
+    "VERB": "M IH2 S P R IH1 N T"
+  },
+  "misprints": {
+    "DEFAULT": "M IH1 S P R IH0 N T S",
+    "VERB": "M IH2 S P R IH1 N T S"
+  },
+  "misuse": {
+    "DEFAULT": "M IH0 S Y UW1 Z",
+    "VERB": "M IH0 S Y UW1 S"
+  },
+  "misuses": {
+    "DEFAULT": "M IH0 S Y UW1 S IH0 Z",
+    "VERB": "M IH0 S Y UW1 Z IH0 Z"
+  },
+  "moderate": {
+    "DEFAULT": "M AA1 D ER0 AH0 T",
+    "VERB": "M AA1 D ER0 EY2 T"
+  },
+  "moderates": {
+    "DEFAULT": "M AA1 D ER0 AH0 T S",
+    "VERB": "M AA1 D ER0 EY2 T S"
+  },
+  "mouth": {
+    "DEFAULT": "M AW1 DH",
+    "VERB": "M AW1 TH"
+  },
+  "mouths": {
+    "DEFAULT": "M AW1 TH S",
+    "VERB": "M AW1 DH Z"
+  },
+  "object": {
+    "DEFAULT": "AA1 B JH EH0 K T",
+    "VERB": "AH0 B JH EH1 K T"
+  },
+  "objects": {
+    "DEFAULT": "AA1 B JH EH0 K T S",
+    "VERB": "AH0 B JH EH1 K T S"
+  },
+  "ornament": {
+    "DEFAULT": "AO1 R N AH0 M AH0 N T",
+    "VERB": "AO1 R N AH0 M EH0 N T"
+  },
+  "ornaments": {
+    "DEFAULT": "AO1 R N AH0 M AH0 N T S",
+    "VERB": "AO1 R N AH0 M EH0 N T S"
+  },
+  "overcharge": {
+    "DEFAULT": "OW1 V ER0 CH AA2 R JH",
+    "VERB": "OW2 V ER0 CH AA1 R JH"
+  },
+  "overcharges": {
+    "DEFAULT": "OW1 V ER0 CH AA2 R JH IH0 Z",
+    "VERB": "OW2 V ER0 CH AA1 R JH IH0 Z"
+  },
+  "overflow": {
+    "DEFAULT": "OW1 V ER0 F L OW2",
+    "VERB": "OW2 V ER0 F L OW1"
+  },
+  "overflows": {
+    "DEFAULT": "OW1 V ER0 F L OW2 Z",
+    "VERB": "OW2 V ER0 F L OW1 Z"
+  },
+  "overhang": {
+    "DEFAULT": "OW1 V ER0 HH AE2 NG",
+    "VERB": "OW2 V ER0 HH AE1 NG"
+  },
+  "overhangs": {
+    "DEFAULT": "OW1 V ER0 HH AE2 NG Z",
+    "VERB": "OW2 V ER0 HH AE1 NG Z"
+  },
+  "overhaul": {
+    "DEFAULT": "OW1 V ER0 HH AO2 L",
+    "VERB": "OW2 V ER0 HH AO1 L"
+  },
+  "overhauls": {
+    "DEFAULT": "OW1 V ER0 HH AO2 L Z",
+    "VERB": "OW2 V ER0 HH AO1 L Z"
+  },
+  "overlap": {
+    "DEFAULT": "OW1 V ER0 L AE2 P",
+    "VERB": "OW2 V ER0 L AE1 P"
+  },
+  "overlaps": {
+    "DEFAULT": "OW1 V ER0 L AE2 P S",
+    "VERB": "OW2 V ER0 L AE1 P S"
+  },
+  "overlay": {
+    "DEFAULT": "OW1 V ER0 L EY2",
+    "VERB": "OW2 V ER0 L EY1"
+  },
+  "overlays": {
+    "DEFAULT": "OW1 V ER0 L EY2 Z",
+    "VERB": "OW2 V ER0 L EY1 Z"
+  },
+  "overwork": {
+    "DEFAULT": "OW1 V ER0 W ER2 K",
+    "VERB": "OW2 V ER0 W ER1 K"
+  },
+  "perfect": {
+    "DEFAULT": "P ER1 F IH2 K T",
+    "VERB": "P ER0 F EH1 K T"
+  },
+  "perfume": {
+    "DEFAULT": "P ER1 F Y UW0 M",
+    "VERB": "P ER0 F Y UW1 M"
+  },
+  "perfumes": {
+    "DEFAULT": "P ER1 F Y UW0 M Z",
+    "VERB": "P ER0 F Y UW1 M Z"
+  },
+  "permit": {
+    "DEFAULT": "P ER1 M IH2 T",
+    "VERB": "P ER0 M IH1 T"
+  },
+  "permits": {
+    "DEFAULT": "P ER1 M IH2 T S",
+    "VERB": "P ER0 M IH1 T S"
+  },
+  "pervert": {
+    "DEFAULT": "P ER1 V ER0 T",
+    "VERB": "P ER0 V ER1 T"
+  },
+  "perverts": {
+    "DEFAULT": "P ER1 V ER0 T S",
+    "VERB": "P ER0 V ER1 T S"
+  },
+  "pontificate": {
+    "DEFAULT": "P AA0 N T IH1 F AH0 K EY2 T",
+    "VERB": "P AA0 N T IH1 F AH0 K AH0 T"
+  },
+  "pontificates": {
+    "DEFAULT": "P AA0 N T IH1 F AH0 K AH0 T S",
+    "VERB": "P AA0 N T IH1 F AH0 K EY2 T S"
+  },
+  "precipitate": {
+    "DEFAULT": "P R IH0 S IH1 P IH0 T EY2 T",
+    "VERB": "P R IH0 S IH1 P IH0 T AH0 T"
+  },
+  "predicate": {
+    "DEFAULT": "P R EH1 D AH0 K EY2 T",
+    "VERB": "P R EH1 D IH0 K AH0 T"
+  },
+  "predicates": {
+    "DEFAULT": "P R EH1 D IH0 K AH0 T S",
+    "VERB": "P R EH1 D AH0 K EY2 T S"
+  },
+  "prefix": {
+    "DEFAULT": "P R IY1 F IH0 K S",
+    "VERB": "P R IY2 F IH1 K S"
+  },
+  "prefixes": {
+    "DEFAULT": "P R IY1 F IH0 K S IH0 JH",
+    "VERB": "P R IY2 F IH1 K S IH0 JH"
+  },
+  "presage": {
+    "DEFAULT": "P R EH1 S IH0 JH",
+    "VERB": "P R EH2 S IH1 JH"
+  },
+  "presages": {
+    "DEFAULT": "P R EH1 S IH0 JH IH0 JH",
+    "VERB": "P R EH2 S IH1 JH IH0 JH"
+  },
+  "present": {
+    "DEFAULT": "P R EH1 Z AH0 N T",
+    "VERB": "P R IY0 Z EH1 N T"
+  },
+  "presents": {
+    "DEFAULT": "P R EH1 Z AH0 N T S",
+    "VERB": "P R IY0 Z EH1 N T S"
+  },
+  "proceeds": {
+    "DEFAULT": "P R OW1 S IY0 D Z",
+    "VERB": "P R AH0 S IY1 D Z"
+  },
+  "process": {
+    "DEFAULT": "P R AA1 S EH2 S",
+    "VERB": "P R AO2 S EH1 S"
+  },
+  "processes": {
+    "DEFAULT": "P R AO2 S EH1 S AH0 Z",
+    "VERB": "P R AA1 S EH0 S AH0 Z"
+  },
+  "processing": {
+    "DEFAULT": "P R AA1 S EH0 S IH0 NG",
+    "VERB": "P R AA0 S EH1 S IH0 NG"
+  },
+  "produce": {
+    "DEFAULT": "P R OW1 D UW0 S",
+    "VERB": "P R AH0 D UW1 S"
+  },
+  "progress": {
+    "DEFAULT": "P R AA1 G R EH2 S",
+    "VERB": "P R AH0 G R EH1 S"
+  },
+  "progresses": {
+    "DEFAULT": "P R AA1 G R EH2 S AH0 Z",
+    "VERB": "P R OW0 G R EH1 S AH0 Z"
+  },
+  "project": {
+    "DEFAULT": "P R AA1 JH EH0 K T",
+    "VERB": "P R AA0 JH EH1 K T"
+  },
+  "projects": {
+    "DEFAULT": "P R AA1 JH EH0 K T S",
+    "VERB": "P R AA0 JH EH1 K T S"
+  },
+  "prospect": {
+    "DEFAULT": "P R AA1 S P EH0 K T",
+    "VERB": "P R AH2 S P EH1 K T"
+  },
+  "prospects": {
+    "DEFAULT": "P R AA1 S P EH0 K T S",
+    "VERB": "P R AH2 S P EH1 K T S"
+  },
+  "prostrate": {
+    "DEFAULT": "P R AA1 S T R EY0 T",
+    "VERB": "P R AA0 S T R EY1 T"
+  },
+  "protest": {
+    "DEFAULT": "P R OW1 T EH2 S T",
+    "VERB": "P R AH0 T EH1 S T"
+  },
+  "protests": {
+    "DEFAULT": "P R OW1 T EH2 S T S",
+    "VERB": "P R AH0 T EH1 S T S"
+  },
+  "purport": {
+    "DEFAULT": "P ER1 P AO2 R T",
+    "VERB": "P ER0 P AO1 R T"
+  },
+  "quadruple": {
+    "DEFAULT": "K W AA0 D R UW1 P AH0 L",
+    "VERB": "K W AA1 D R UW0 P AH0 L"
+  },
+  "quadruples": {
+    "DEFAULT": "K W AA1 D R UW0 P AH0 L Z",
+    "VERB": "K W AA0 D R UW1 P AH0 L Z"
+  },
+  "ragged": {
+    "DEFAULT": "R AE1 G AH0 D",
+    "VERB": "R AE1 G D"
+  },
+  "rampage": {
+    "DEFAULT": "R AE1 M P EY2 JH",
+    "VERB": "R AE2 M P EY1 JH"
+  },
+  "rampages": {
+    "DEFAULT": "R AE1 M P EY2 JH IH0 Z",
+    "VERB": "R AE2 M P EY1 JH IH0 Z"
+  },
+  "read": {
+    "DEFAULT": "R IY1 D",
+    "VBD": "R EH1 D",
+    "VBN": "R EH1 D",
+    "VBP": "R EH1 D"
+  },
+  "rebel": {
+    "DEFAULT": "R IH0 B EH1 L",
+    "VERB": "R EH1 B AH0 L"
+  },
+  "rebels": {
+    "DEFAULT": "R EH1 B AH0 L Z",
+    "VERB": "R IH0 B EH1 L Z"
+  },
+  "rebound": {
+    "DEFAULT": "R IY1 B AW0 N D",
+    "VERB": "R IY0 B AW1 N D"
+  },
+  "rebounds": {
+    "DEFAULT": "R IY1 B AW0 N D Z",
+    "VERB": "R IY0 B AW1 N D Z"
+  },
+  "recall": {
+    "DEFAULT": "R IY1 K AO2 L",
+    "VERB": "R IH0 K AO1 L"
+  },
+  "recalls": {
+    "DEFAULT": "R IY1 K AO2 L Z",
+    "VERB": "R IH0 K AO1 L Z"
+  },
+  "recap": {
+    "DEFAULT": "R IY1 K AE2 P",
+    "VERB": "R IH0 K AE1 P"
+  },
+  "recapped": {
+    "DEFAULT": "R IY1 K AE2 P T",
+    "VERB": "R IH0 K AE1 P T"
+  },
+  "recapping": {
+    "DEFAULT": "R IY1 K AE2 P IH0 NG",
+    "VERB": "R IH0 K AE1 P IH0 NG"
+  },
+  "recaps": {
+    "DEFAULT": "R IY1 K AE2 P S",
+    "VERB": "R IH0 K AE1 P S"
+  },
+  "record": {
+    "DEFAULT": "R EH1 K ER0 D",
+    "VERB": "R IH0 K AO1 R D"
+  },
+  "records": {
+    "DEFAULT": "R EH1 K ER0 D Z",
+    "VERB": "R IH0 K AO1 R D Z"
+  },
+  "recount": {
+    "DEFAULT": " R IH1 K AW0 N T",
+    "VERB": "R IY2 K AW1 N T"
+  },
+  "recounts": {
+    "DEFAULT": " R IH1 K AW0 N T S",
+    "VERB": "R IY2 K AW1 N T S"
+  },
+  "refill": {
+    "DEFAULT": "R IY1 F IH0 L",
+    "VERB": "R IY0 F IH1 L"
+  },
+  "refills": {
+    "DEFAULT": "R IY1 F IH0 L Z",
+    "VERB": "R IY0 F IH1 L Z"
+  },
+  "refit": {
+    "DEFAULT": "R IY1 F IH0 T",
+    "VERB": "R IY0 F IH1 T"
+  },
+  "refits": {
+    "DEFAULT": "R IY1 F IH0 T S",
+    "VERB": "R IY0 F IH1 T S"
+  },
+  "refresh": {
+    "DEFAULT": "R IH1 F R EH0 SH",
+    "VERB": "R IH0 F R EH1 SH"
+  },
+  "refund": {
+    "DEFAULT": "R IY1 F AH2 N D",
+    "VERB": "R IH0 F AH1 N D"
+  },
+  "refunds": {
+    "DEFAULT": "R IY1 F AH2 N D Z",
+    "VERB": "R IH0 F AH1 N D Z"
+  },
+  "refuse": {
+    "DEFAULT": "R EH1 F Y UW2 Z",
+    "VERB": "R IH0 F Y UW1 Z"
+  },
+  "regenerate": {
+    "DEFAULT": "R IY0 JH EH1 N ER0 AH0 T",
+    "VERB": "R IY0 JH EH1 N ER0 EY2 T"
+  },
+  "rehash": {
+    "DEFAULT": "R IY1 HH AE0 SH",
+    "VERB": "R IY0 HH AE1 SH"
+  },
+  "rehashes": {
+    "DEFAULT": "R IY1 HH AE0 SH IH0 Z",
+    "VERB": "R IY0 HH AE1 SH IH0 Z"
+  },
+  "reincarnate": {
+    "DEFAULT": "R IY2 IH0 N K AA1 R N AH0 T",
+    "VERB": "R IY2 IH0 N K AA1 R N EY2 T"
+  },
+  "reject": {
+    "DEFAULT": "R IY1 JH EH0 K T",
+    "VERB": "R IH0 JH EH1 K T"
+  },
+  "rejects": {
+    "DEFAULT": "R IY1 JH EH0 K T S",
+    "VERB": "R IH0 JH EH1 K T S"
+  },
+  "relay": {
+    "DEFAULT": "R IY1 L EY2",
+    "VERB": "R IY2 L EY1"
+  },
+  "relaying": {
+    "DEFAULT": "R IY1 L EY2 IH0 NG",
+    "VERB": "R IY2 L EY1 IH0 NG"
+  },
+  "relays": {
+    "DEFAULT": "R IY1 L EY2 Z",
+    "VERB": "R IY2 L EY1 Z"
+  },
+  "remake": {
+    "DEFAULT": "R IY1 M EY0 K",
+    "VERB": "R IY2 M EY1 K"
+  },
+  "remakes": {
+    "DEFAULT": "R IY1 M EY0 K S",
+    "VERB": "R IY2 M EY1 K S"
+  },
+  "replay": {
+    "DEFAULT": "R IY1 P L EY0",
+    "VERB": "R IY0 P L EY1"
+  },
+  "replays": {
+    "DEFAULT": "R IY1 P L EY0 Z",
+    "VERB": "R IY0 P L EY1 Z"
+  },
+  "reprint": {
+    "DEFAULT": "R IY1 P R IH0 N T",
+    "VERB": "R IY0 P R IH1 N T"
+  },
+  "reprints": {
+    "DEFAULT": "R IY1 P R IH0 N T S",
+    "VERB": "R IY0 P R IH1 N T S"
+  },
+  "rerun": {
+    "DEFAULT": "R IY1 R AH0 N",
+    "VERB": "R IY2 R AH1 N"
+  },
+  "reruns": {
+    "DEFAULT": "R IY1 R AH0 N Z",
+    "VERB": "R IY2 R AH1 N Z"
+  },
+  "resume": {
+    "DEFAULT": "R EH1 Z AH0 M EY2",
+    "VERB": "R IY0 Z UW1 M"
+  },
+  "retake": {
+    "DEFAULT": "R IY1 T EY0 K",
+    "VERB": "R IY0 T EY1 K"
+  },
+  "retakes": {
+    "DEFAULT": "R IY1 T EY0 K S",
+    "VERB": "R IY0 T EY1 K S"
+  },
+  "rethink": {
+    "DEFAULT": "R IY1 TH IH0 NG K",
+    "VERB": "R IY2 TH IH1 NG K"
+  },
+  "rethinks": {
+    "DEFAULT": "R IY1 TH IH0 NG K S",
+    "VERB": "R IY2 TH IH1 NG K S"
+  },
+  "retread": {
+    "DEFAULT": "R IY1 T R EH0 D",
+    "VERB": "R IY2 T R EH1 D"
+  },
+  "retreads": {
+    "DEFAULT": "R IY1 T R EH0 D Z",
+    "VERB": "R IY2 T R EH1 D Z"
+  },
+  "rewrite": {
+    "DEFAULT": "R IY1 R AY2 T",
+    "VERB": "R IY0 R AY1 T"
+  },
+  "rewrites": {
+    "DEFAULT": "R IY1 R AY2 T S",
+    "VERB": "R IY0 R AY1 T S"
+  },
+  "segment": {
+    "DEFAULT": "S EH2 G M EH1 N T",
+    "VERB": "S EH1 G M AH0 N T"
+  },
+  "segments": {
+    "DEFAULT": "S EH1 G M AH0 N T S",
+    "VERB": "S EH2 G M EH1 N T S"
+  },
+  "separate": {
+    "DEFAULT": "S EH1 P ER0 IH0 T",
+    "VERB": "S EH1 P ER0 EY2 T"
+  },
+  "separates": {
+    "DEFAULT": "S EH1 P ER0 IH0 T S",
+    "VERB": "S EH1 P ER0 EY2 T S"
+  },
+  "subcontract": {
+    "DEFAULT": "S AH2 B K AA0 N T R AE1 K T",
+    "VERB": "S AH0 B K AA1 N T R AE2 K T"
+  },
+  "subcontracts": {
+    "DEFAULT": "S AH0 B K AA1 N T R AE2 K T S",
+    "VERB": "S AH2 B K AA0 N T R AE1 K T S"
+  },
+  "subject": {
+    "DEFAULT": "S AH1 B JH IH0 K T",
+    "VERB": "S AH0 B JH EH1 K T"
+  },
+  "subjects": {
+    "DEFAULT": "S AH1 B JH IH0 K T S",
+    "VERB": "S AH0 B JH EH1 K T S"
+  },
+  "subordinate": {
+    "DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T",
+    "VERB": "S AH0 B AO1 R D AH0 N EY2 T"
+  },
+  "subordinates": {
+    "DEFAULT": "S AH0 B AO1 R D AH0 N AH0 T S",
+    "VERB": "S AH0 B AO1 R D AH0 N EY2 T S"
+  },
+  "supplement": {
+    "DEFAULT": "S AH1 P L AH0 M AH0 N T",
+    "VERB": "S AH1 P L AH0 M EH0 N T"
+  },
+  "supplements": {
+    "DEFAULT": "S AH1 P L AH0 M AH0 N T S",
+    "VERB": "S AH1 P L AH0 M EH0 N T S"
+  },
+  "surmise": {
+    "DEFAULT": "S ER1 M AY0 Z",
+    "VERB": "S ER0 M AY1 Z"
+  },
+  "surmises": {
+    "DEFAULT": "S ER1 M AY0 Z IH0 Z",
+    "VERB": "S ER0 M AY1 Z IH0 Z"
+  },
+  "survey": {
+    "DEFAULT": "S ER1 V EY2",
+    "VERB": "S ER0 V EY1"
+  },
+  "surveys": {
+    "DEFAULT": "S ER1 V EY2 Z",
+    "VERB": "S ER0 V EY1 Z"
+  },
+  "suspect": {
+    "DEFAULT": "S AH1 S P EH2 K T",
+    "VERB": "S AH0 S P EH1 K T"
+  },
+  "suspects": {
+    "DEFAULT": "S AH1 S P EH2 K T S",
+    "VERB": "S AH0 S P EH1 K T S"
+  },
+  "syndicate": {
+    "DEFAULT": "S IH1 N D IH0 K AH0 T",
+    "VERB": "S IH1 N D AH0 K EY2 T"
1395
+ },
1396
+ "syndicates": {
1397
+ "DEFAULT": "S IH1 N D IH0 K AH0 T S",
1398
+ "VERB": "S IH1 N D IH0 K EY2 T S"
1399
+ },
1400
+ "torment": {
1401
+ "DEFAULT": "T AO0 R M EH1 N T",
1402
+ "VERB": "T AO1 R M EH2 N T"
1403
+ },
1404
+ "transfer": {
1405
+ "DEFAULT": "T R AE1 N S F ER0",
1406
+ "VERB": "T R AE0 N S F ER1"
1407
+ },
1408
+ "transfers": {
1409
+ "DEFAULT": "T R AE1 N S F ER0 Z",
1410
+ "VERB": "T R AE0 N S F ER1 Z"
1411
+ },
1412
+ "transplant": {
1413
+ "DEFAULT": "T R AE1 N S P L AE0 N T",
1414
+ "VERB": "T R AE0 N S P L AE1 N T"
1415
+ },
1416
+ "transplants": {
1417
+ "DEFAULT": "T R AE1 N S P L AE0 N T S",
1418
+ "VERB": "T R AE0 N S P L AE1 N T S"
1419
+ },
1420
+ "transport": {
1421
+ "DEFAULT": "T R AE1 N S P AO0 R T",
1422
+ "VERB": "T R AE0 N S P AO1 R T"
1423
+ },
1424
+ "transports": {
1425
+ "DEFAULT": "T R AE1 N S P AO0 R T S",
1426
+ "VERB": "T R AE0 N S P AO1 R T S"
1427
+ },
1428
+ "triplicate": {
1429
+ "DEFAULT": "T R IH1 P L IH0 K AH0 T",
1430
+ "VERB": "T R IH1 P L IH0 K EY2 T"
1431
+ },
1432
+ "triplicates": {
1433
+ "DEFAULT": "T R IH1 P L IH0 K AH0 T S",
1434
+ "VERB": "T R IH1 P L IH0 K EY2 T S"
1435
+ },
1436
+ "undercut": {
1437
+ "DEFAULT": "AH1 N D ER0 K AH2 T",
1438
+ "VERB": "AH2 N D ER0 K AH1 T"
1439
+ },
1440
+ "underestimate": {
1441
+ "DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T",
1442
+ "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T"
1443
+ },
1444
+ "underestimates": {
1445
+ "DEFAULT": "AH1 N D ER0 EH1 S T AH0 M AH0 T S",
1446
+ "VERB": "AH1 N D ER0 EH1 S T AH0 M EY2 T S"
1447
+ },
1448
+ "underline": {
1449
+ "DEFAULT": "AH1 N D ER0 L AY2 N",
1450
+ "VERB": "AH2 N D ER0 L AY1 N"
1451
+ },
1452
+ "underlines": {
1453
+ "DEFAULT": "AH1 N D ER0 L AY2 N Z",
1454
+ "VERB": "AH2 N D ER0 L AY1 N Z"
1455
+ },
1456
+ "undertaking": {
1457
+ "DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG",
1458
+ "VERB": "AH2 N D ER0 T EY1 K IH0 NG"
1459
+ },
1460
+ "undertakings": {
1461
+ "DEFAULT": "AH1 N D ER0 T EY2 K IH0 NG Z",
1462
+ "VERB": "AH2 N D ER0 T EY1 K IH0 NG Z"
1463
+ },
1464
+ "unused": {
1465
+ "DEFAULT": "AH0 N Y UW1 S T",
1466
+ "VERB": "AH0 N Y UW1 Z D"
1467
+ },
1468
+ "upgrade": {
1469
+ "DEFAULT": "AH1 P G R EY0 D",
1470
+ "VERB": "AH0 P G R EY1 D"
1471
+ },
1472
+ "upgrades": {
1473
+ "DEFAULT": "AH1 P G R EY0 D Z",
1474
+ "VERB": "AH0 P G R EY1 D Z"
1475
+ },
1476
+ "uplift": {
1477
+ "DEFAULT": "AH1 P L IH0 F T",
1478
+ "VERB": "AH2 P L IH1 F T"
1479
+ },
1480
+ "upset": {
1481
+ "DEFAULT": "AH1 P S EH2 T",
1482
+ "VERB": "AH0 P S EH1 T"
1483
+ },
1484
+ "upsets": {
1485
+ "DEFAULT": "AH1 P S EH2 T S",
1486
+ "VERB": "AH0 P S EH1 T S"
1487
+ },
1488
+ "use": {
1489
+ "DEFAULT": "Y UW1 S",
1490
+ "VERB": "Y UW1 Z"
1491
+ },
1492
+ "used": {
1493
+ "DEFAULT": "Y UW1 S T",
1494
+ "VBN": "Y UW1 Z D"
1495
+ },
1496
+ "uses": {
1497
+ "DEFAULT": "Y UW1 S IH0 Z",
1498
+ "VERB": "Y UW1 Z IH0 Z"
1499
+ }
1500
+ }
resources/app/python/xvapitch/text/h2p_parser/data/example.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "absent": {
+ "VERB": "AH1 B S AE1 N T",
+ "DEFAULT": "AE1 B S AH0 N T"
+ },
+ "reject": {
+ "VERB": "R IH0 JH EH1 K T",
+ "DEFAULT": "R IY1 JH EH0 K T"
+ },
+ "read": {
+ "VBD": "R EH1 D",
+ "VBN": "R EH1 D",
+ "VBP": "R EH1 D",
+ "DEFAULT": "R IY1 D"
+ }
+ }
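
The example file above illustrates the schema the heteronym dictionaries use: each word maps POS tags (or the fallback key DEFAULT) to ARPAbet phoneme strings. As a minimal sketch, assuming a file of this shape is saved locally as example.json, it can replace the built-in dict.json:

    from h2p_parser.h2p import H2p

    # Any JSON file following the schema above can be passed as a custom dictionary
    h2p = H2p(dict_path='example.json')
    print(h2p.replace_het('I read the book'))  # 'read' should resolve via the entries above
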
resources/app/python/xvapitch/text/h2p_parser/dict_reader.py ADDED
@@ -0,0 +1,109 @@
+ # This reads a CMUDict formatted dictionary as a dictionary object
+ import re
+ import h2p_parser.format_ph as ph
+ from . import DATA_PATH
+
+
+ _dict_primary = 'cmudict.dict'
+
+
+ def read_dict(filename: str) -> list:
+     # Read the file
+     with open(filename, encoding='utf-8', mode='r') as f:
+         # Read the file into lines
+         lines = f.readlines()
+     # Remove any line starting with ";;;"
+     lines = [line for line in lines if not line.startswith(';;;')]
+     return lines
+
+
+ def parse_dict(lines: list) -> dict:
+     # Create a dictionary to store the parsed data
+     parsed_dict = {}
+     # Detect file format
+
+     # We will read the first 10 lines to determine the format
+     # Default to SSD format unless we find otherwise
+     dict_form = 'SSD'
+     for line in lines[:10]:
+         # Strip new lines
+         line = line.strip()
+         if line == '':
+             continue
+         """
+         Format 1 (Double Space Delimited):
+         - Comments allowed at the start of a line using ";;;"
+         WORD  W ER1 D
+
+         Format 2 (Single Space Delimited):
+         - Comments allowed at the end of any line using "#"
+         WORD W ER1 D # Comment
+         """
+         if '  ' in line:
+             dict_form = 'DSD'
+             break
+
+     # Iterate over the lines
+     for line in lines:
+         # Skip empty lines and lines with no space
+         line = line.strip()
+         if line == '' or ' ' not in line:
+             continue
+
+         # Split depending on format
+         if dict_form == 'DSD':
+             pairs = line.split('  ')
+         else:
+             space_index = line.find(' ')
+             line_split = line[:space_index], line[space_index + 1:]
+             pairs = line_split[0], line_split[1].split('#')[0]
+
+         word = str.lower(pairs[0])  # Get word and lowercase it
+         phonemes = ph.to_list(pairs[1])  # Convert to list of phonemes
+         phonemes = [phonemes]  # Wrap in nested list
+         word_num = 0
+         word_orig = None
+
+         # Detect if this is a multi-word entry
+         if ('(' in word) and (')' in word) and any(char.isdigit() for char in word):
+             # Parse the integer from the word using regex
+             result = int(re.findall(r"\((\d+)\)", word)[0])
+             # If found
+             if result is not None:
+                 # Set the original word
+                 word_orig = word
+                 # Remove the integer and bracket from the word
+                 word = re.sub(r"\(.*\)", "", word)
+                 # Set the word number to the result
+                 word_num = result
+
+         # Check existing key
+         if word in parsed_dict:
+             # If word number is 0, ignore
+             if word_num == 0:
+                 continue
+             # If word number is not 0, add phoneme to existing key at index
+             parsed_dict[word].extend(phonemes)
+             # Also add the original word if it exists
+             if word_orig is not None:
+                 parsed_dict[word_orig] = phonemes
+         else:
+             # Create a new key
+             parsed_dict[word] = phonemes
+
+     # Return the dictionary
+     return parsed_dict
+
+
+ class DictReader:
+     def __init__(self, filename=None):
+         self.filename = filename
+         self.dict = {}
+         # If filename is None, use the default dictionary
+         # default = 'data' uses the dictionary file in the data module
+         # default = 'nltk' uses the nltk cmudict
+         if filename is not None:
+             self.dict = parse_dict(read_dict(filename))
+         else:
+             with DATA_PATH.joinpath(_dict_primary) as f:
+                 self.dict = parse_dict(read_dict(f))
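
A quick usage sketch for the reader above, assuming the bundled cmudict.dict is present. Alternate pronunciations such as "read(2)" are folded into a nested list under the base word:

    from h2p_parser.dict_reader import DictReader

    reader = DictReader()  # parses the bundled cmudict.dict
    # Each value is a list of pronunciations, each itself a list of phonemes,
    # e.g. reader.dict['read'] should look like [['R', 'IY1', 'D'], ['R', 'EH1', 'D']]
    print(reader.dict.get('read'))
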
resources/app/python/xvapitch/text/h2p_parser/dictionary.py ADDED
@@ -0,0 +1,85 @@
+ # dictionary.py
+
+ # Defines a dictionary class that can be used to store and retrieve from the json file
+ import sys
+ if sys.version_info < (3, 9):
+     # In Python versions below 3.9, this is needed
+     import importlib_resources as pkg_resources
+ else:
+     # Since python 3.9+, importlib.resources.files is built-in
+     import importlib.resources as pkg_resources
+ from os.path import exists
+ import json
+ import h2p_parser.pos_parser as pos_parser
+
+
+ # Method to get data path
+ def get_data_path():
+     data_path = pkg_resources.files('h2p_parser.data')
+     if data_path is None:
+         raise FileNotFoundError("Data folder not found")
+     return data_path
+
+
+ # Dictionary class
+ class Dictionary:
+     def __init__(self, file_name=None):
+         # If a file name is not provided, use the default file name
+         self.file_name = file_name
+         if file_name is None:
+             self.file_name = 'dict.json'
+             self.use_default = True
+         else:
+             self.file_name = file_name
+             self.use_default = False
+         self.dictionary = {}
+         self.dictionary = self.load_dictionary(file_name)
+
+     # Loads the dictionary from the json file
+     def load_dictionary(self, path=None):
+         if path is None:
+             data_path = get_data_path()
+             dict_path = data_path.joinpath(self.file_name)
+             with open(str(dict_path)) as def_file:
+                 read_dict = json.load(def_file)
+         else:
+             if not exists(path):
+                 raise FileNotFoundError(f'Dictionary {self.file_name} file not found')
+             with open(path) as file:
+                 try:
+                     read_dict = json.load(file)
+                 except json.decoder.JSONDecodeError:
+                     raise ValueError(f'Dictionary {self.file_name} file is not valid JSON')
+         # Check dictionary has at least one entry
+         if len(read_dict) == 0:
+             raise ValueError('Dictionary is empty or invalid')
+         return read_dict
+
+     # Check if a word is in the dictionary
+     def contains(self, word):
+         word = word.lower()
+         return word in self.dictionary
+
+     # Get the phonetic pronunciation of a word using Part of Speech tag
+     def get_phoneme(self, word, pos):
+         # Get the sub-dictionary at dictionary[word]
+         sub_dict = self.dictionary[word.lower()]
+
+         # First, check if the exact pos is a key
+         if pos in sub_dict:
+             return sub_dict[pos]
+
+         # If not, use the parent pos of the pos tag
+         parent_pos = pos_parser.get_parent_pos(pos)
+
+         if parent_pos is not None:
+             # Check if the sub_dict contains the parent pos
+             if parent_pos in sub_dict:
+                 return sub_dict[parent_pos]
+
+         # If not, check if the sub_dict contains a DEFAULT key
+         if 'DEFAULT' in sub_dict:
+             return sub_dict['DEFAULT']
+
+         # If no matches, return None
+         return None
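
A short sketch of the POS fallback chain in get_phoneme(), using the "record" entry from the bundled dict.json shown earlier: an exact tag match wins, then the parent tag, then DEFAULT.

    from h2p_parser.dictionary import Dictionary

    d = Dictionary()  # loads the bundled dict.json
    print(d.get_phoneme('record', 'VBD'))  # no 'VBD' key; parent is 'VERB' -> 'R IH0 K AO1 R D'
    print(d.get_phoneme('record', 'NN'))   # no 'NOUN' key either -> DEFAULT 'R EH1 K ER0 D'
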
resources/app/python/xvapitch/text/h2p_parser/filter.py ADDED
@@ -0,0 +1,34 @@
+ from unicodedata import normalize
+ import re
+
+ # Pre-compile regex
+ re_filter = re.compile(r"[^ A-Za-z'.,?!()\-]")
+ re_filter_with_num = re.compile(r"[^ A-Za-z\d'.,?!()\-]")
+ re_multi_space = re.compile(r"\s\s+")
+
+
+ # Filters text before parsing
+ # @param text: text to be filtered
+ # @return: filtered text
+ def filter_text(text: str, allow_num: bool = False, preserve_case: bool = False) -> str:
+     """
+     Filters text before parsing
+     :param preserve_case: True to keep the original casing
+     :param allow_num: True if numbers are allowed
+     :param text: Input raw text
+     :return: Text with accents stripped, case lowered, and invalid punctuation removed
+     """
+     # Strip accents
+     text = normalize('NFD', text)
+     # To lowercase
+     if not preserve_case:
+         text = text.lower()
+     # Remove all invalid punctuation
+     if allow_num:
+         text = re.sub(re_filter_with_num, '', text)
+     else:
+         text = re.sub(re_filter, "", text)
+     # Collapse runs of multiple spaces
+     text = re.sub(re_multi_space, " ", text)
+     # Return
+     return text
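
Illustrative behavior of filter_text(), based on the regexes above (outputs are what those patterns should produce):

    from h2p_parser.filter import filter_text

    print(filter_text('Café 42!'))                  # -> 'cafe !' (accent and digits removed)
    print(filter_text('Café 42!', allow_num=True))  # -> 'cafe 42!'
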
resources/app/python/xvapitch/text/h2p_parser/format_ph.py ADDED
@@ -0,0 +1,99 @@
+ from typing import overload
+
+ # Converts and outputs various formats of phonemes
+
+
+ @overload
+ def to_sds(ph: str) -> str: ...
+
+
+ @overload
+ def to_sds(ph: list) -> str: ...
+
+
+ def to_sds(ph: list or str) -> str or None:
+     """
+     Converts phonemes to space delimited string format
+
+     :param ph: Phoneme as str or list, supports nested lists
+     :return: Phoneme as space delimited string
+     """
+     # Return None if None
+     if ph is None:
+         return None
+
+     # Return directly if str
+     if isinstance(ph, str):
+         return ph
+     # If is list, convert each element
+     if isinstance(ph, list):
+         # If list empty, return None
+         if len(ph) == 0:
+             return None
+         # Case for further lists
+         if isinstance(ph[0], list):
+             return to_sds(ph[0])  # Recursive call
+         # Case if str at index 0, and size 1, return directly
+         elif isinstance(ph[0], str) and len(ph) == 1:
+             return ph[0]
+         # Case if str at index 0, above size 1, return with join
+         elif isinstance(ph[0], str):
+             return ' '.join(ph)
+         # Case for none
+         elif ph[0] is None:
+             return None
+         else:
+             raise TypeError('to_sds() encountered an unexpected nested element type')
+     # Error if no matches
+     raise TypeError('to_sds() expects a list or string')
+
+
+ @overload
+ def to_list(ph: str) -> list: ...
+
+
+ @overload
+ def to_list(ph: list) -> list: ...
+
+
+ def to_list(ph: str or list) -> list or None:
+     """
+     Converts phonemes to list format
+
+     :param ph: Phoneme as str or list, supports nested lists
+     :return: Phoneme as list
+     """
+     # Return None if None
+     if ph is None:
+         return None
+
+     # Return directly if list and index 0 is str
+     if isinstance(ph, list) and len(ph) > 0 and isinstance(ph[0], str):
+         return ph
+
+     # If space delimited string, convert to list
+     if isinstance(ph, str):
+         return ph.split(' ')
+
+     # If nested list, convert each element
+     if isinstance(ph, list):
+         # If list empty or has None, return None
+         if len(ph) == 0 or ph[0] is None:
+             return None
+         # Case for further lists
+         if isinstance(ph[0], list):
+             return to_list(ph[0])  # Recursive call
+
+     # Error if no matches
+     raise TypeError('to_list() expects a list or string')
+
+
+ # Surrounds text with curly brackets
+ def with_cb(text: str) -> str:
+     """
+     Surrounds text with curly brackets
+
+     :param text: Text to surround
+     :return: Surrounded text
+     """
+     return '{' + text + '}'
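
A few illustrative calls for the converters above:

    from h2p_parser import format_ph as ph

    ph.to_sds([['R', 'EH1', 'D']])  # -> 'R EH1 D' (nested lists are unwrapped)
    ph.to_list('R EH1 D')           # -> ['R', 'EH1', 'D']
    ph.with_cb('R EH1 D')           # -> '{R EH1 D}'
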
resources/app/python/xvapitch/text/h2p_parser/h2p.py ADDED
@@ -0,0 +1,123 @@
+ import nltk
+ import re
+ from nltk.tokenize import TweetTokenizer
+ from nltk import pos_tag
+ from nltk import pos_tag_sents
+ from .dictionary import Dictionary
+ from .filter import filter_text as ft
+ from . import format_ph as ph
+
+ # Check that the nltk data is downloaded, if not, download it
+ try:
+     nltk.data.find('taggers/averaged_perceptron_tagger.zip')
+ except LookupError:
+     nltk.download('averaged_perceptron_tagger')
+
+
+ # Method to use Regex to replace the first instance of a word with its phonemes
+ def replace_first(target, replacement, text):
+     # Skip if target invalid
+     if target is None or target == '':
+         return text
+     # Replace the first instance of a word with its phonemes
+     return re.sub(r'(?i)\b' + target + r'\b', replacement, text, 1)
+
+
+ class H2p:
+     def __init__(self, dict_path=None, preload=False, phoneme_format=''):
+         """
+         Creates a H2p parser
+
+         Supported phoneme formats:
+             - Space delimited
+             - Space delimited surrounded by { }
+
+         :param dict_path: Path to a heteronym dictionary json file. Built-in dictionary will be used if None
+         :type dict_path: str
+         :param preload: Preloads the tokenizer and tagger during initialization
+         :type preload: bool
+         """
+
+         # Supported phoneme formats
+         self.phoneme_format = phoneme_format
+         self.dict = Dictionary(dict_path)
+         self.tokenize = TweetTokenizer().tokenize
+         self.get_tags = pos_tag
+         if preload:
+             self.preload()
+
+     # Method to preload tokenizer and pos_tag
+     def preload(self):
+         tokens = self.tokenize('a')
+         assert tokens == ['a']
+         assert pos_tag(tokens)[0][0] == 'a'
+
+     # Method to check if a text line contains a heteronym
+     def contains_het(self, text):
+         # Filter the text
+         text = ft(text)
+         # Tokenize
+         words = self.tokenize(text)
+         # Check match with dictionary
+         hets = []
+         for word in words:
+             if self.dict.contains(word):
+                 hets.append(word)
+         return len(hets) > 0, hets
+
+     # Method to replace heteronyms in a text line with phonemes
+     def replace_het(self, text):
+         # Filter the text
+         working_text = ft(text, preserve_case=True)
+         # Tokenize
+         words = self.tokenize(working_text)
+         # Get pos tags
+         tags = pos_tag(words)
+         # Loop through words and pos tags
+         for word, pos in tags:
+             # Skip if word not in dictionary
+             if not self.dict.contains(word):
+                 continue
+             # Get phonemes
+             phonemes = self.dict.get_phoneme(word, pos)
+             # Format phonemes
+             f_ph = ph.with_cb(ph.to_sds(phonemes))
+             # Replace word with phonemes
+             text = replace_first(word, f_ph, text)
+         return text
+
+     # Replaces heteronyms in a list of text lines
+     # Slightly faster than replace_het() called on each line
+     def replace_het_list(self, text_list):
+         # Filter the text
+         working_text_list = [ft(text, preserve_case=True) for text in text_list]
+         # Tokenize
+         list_sentence_words = [self.tokenize(text) for text in working_text_list]
+         # Get pos tags list
+         tags_list = pos_tag_sents(list_sentence_words)
+         # Loop through lines
+         for index in range(len(tags_list)):
+             # Loop through words and pos tags in tags_list index
+             for word, pos in tags_list[index]:
+                 # Skip if word not in dictionary
+                 if not self.dict.contains(word):
+                     continue
+                 # Get phonemes
+                 phonemes = self.dict.get_phoneme(word, pos)
+                 # Format phonemes
+                 f_ph = ph.with_cb(ph.to_sds(phonemes))
+                 # Replace word with phonemes
+                 text_list[index] = replace_first(word, f_ph, text_list[index])
+         return text_list
+
+     # Method to tag a text line, returns a list of tags
+     def tag(self, text):
+         # Filter the text
+         working_text = ft(text, preserve_case=True)
+         # Tokenize
+         words = self.tokenize(working_text)
+         # Get pos tags
+         tags = pos_tag(words)
+         # Only return element 1 of each tuple
+         return [tag[1] for tag in tags]
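
An end-to-end sketch of the parser above (nltk tagger data is downloaded on first import if missing):

    from h2p_parser.h2p import H2p

    h2p = H2p(preload=True)
    print(h2p.contains_het('I read the book'))  # -> (True, ['read'])
    # The heteronym is swapped for its tagged pronunciation in { } brackets,
    # e.g. something like 'I {R EH1 D} the book' when 'read' is tagged as past tense
    print(h2p.replace_het('I read the book'))
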
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/PKG-INFO ADDED
@@ -0,0 +1,14 @@
+ Metadata-Version: 2.1
+ Name: h2p-parser
+ Version: 1.0.0
+ Summary: Heteronym to Phoneme Parser
+ Home-page: https://github.com/ionite34/h2p-parser
+ Author: ionite
+ Author-email: [email protected]
+ License: Apache 2.0
+ Platform: UNKNOWN
+ Requires-Python: >=3.7
+ License-File: LICENSE
+
+ UNKNOWN
+
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,19 @@
+ LICENSE
+ README.md
+ setup.py
+ h2p_parser/__init__.py
+ h2p_parser/__main__.py
+ h2p_parser/cmudictext.py
+ h2p_parser/dict_reader.py
+ h2p_parser/dictionary.py
+ h2p_parser/filter.py
+ h2p_parser/format_ph.py
+ h2p_parser/h2p.py
+ h2p_parser/pos_parser.py
+ h2p_parser/processors.py
+ h2p_parser/symbols.py
+ h2p_parser/h2p_parser.egg-info/PKG-INFO
+ h2p_parser/h2p_parser.egg-info/SOURCES.txt
+ h2p_parser/h2p_parser.egg-info/dependency_links.txt
+ h2p_parser/h2p_parser.egg-info/requires.txt
+ h2p_parser/h2p_parser.egg-info/top_level.txt
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/requires.txt ADDED
@@ -0,0 +1,2 @@
+ nltk
+ inflect
resources/app/python/xvapitch/text/h2p_parser/h2p_parser.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+
resources/app/python/xvapitch/text/h2p_parser/pos_parser.py ADDED
@@ -0,0 +1,17 @@
+ # Part of Speech Tag Operations
+
+ # Method to get the parent part of speech (VERB) or (NOUN) from a pos tag
+ # from __future__ import annotations
+
+ # def get_parent_pos(pos: str) -> str | None:
+ def get_parent_pos(pos):
+     # Get the parent part of speech from a pos tag
+     if pos.startswith('VB'):
+         return 'VERB'
+     elif pos.startswith('NN'):
+         return 'NOUN'
+     elif pos.startswith('RB'):
+         return 'ADVERB'
+     else:
+         return None
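
For example:

    from h2p_parser.pos_parser import get_parent_pos

    get_parent_pos('VBD')  # -> 'VERB'
    get_parent_pos('NNS')  # -> 'NOUN'
    get_parent_pos('JJ')   # -> None (adjectives have no parent mapping here)
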
resources/app/python/xvapitch/text/h2p_parser/processors.py ADDED
@@ -0,0 +1,392 @@
+ # Transformations of text sequences for matching
+ from __future__ import annotations
+ from typing import TYPE_CHECKING
+ from .symbols import consonants
+
+ import re
+
+ if TYPE_CHECKING:
+     from .cmudictext import CMUDictExt
+
+ _re_digit = re.compile(r'\d+')
+
+
+ class Processor:
+     def __init__(self, cde: CMUDictExt):
+         self._lookup = cde.lookup
+         self._cmu_get = cde.dict.get
+         self._segment = cde.segment
+         self._tag = cde.h2p.tag
+         self._stem = cde.stem
+         # Number of times respective methods were called
+         self.stat_hits = {
+             'plural': 0,
+             'possessives': 0,
+             'contractions': 0,
+             'hyphenated': 0,
+             'compound': 0,
+             'compound_l2': 0,
+             'stem': 0
+         }
+         # Number of times respective methods returned a value (not None)
+         self.stat_resolves = {
+             'plural': 0,
+             'possessives': 0,
+             'contractions': 0,
+             'hyphenated': 0,
+             'compound': 0,
+             'compound_l2': 0,
+             'stem': 0
+         }
+         # Holds events when features encountered unexpected language syntax
+         self.stat_unexpected = {
+             'plural': [],
+             'possessives': [],
+             'contractions': [],
+             'hyphenated': [],
+             'compound': [],
+             'compound_l2': [],
+             'stem': []
+         }
+
+     def auto_possessives(self, word: str) -> str | None:
+         """
+         Auto-possessives
+         :param word: Input of possible possessive word
+         :return: Phoneme of word as SDS, or None if unresolvable
+         """
+         if not word.endswith("'s"):
+             return None
+         # If the word ends with "'s", register a hit
+         self.stat_hits['possessives'] += 1
+         """
+         There are 3 general cases:
+         1. Base words ending in one of 6 special consonants (sibilants)
+             - i.e. Tess's, Rose's, Butch's, Midge's, Rush's, Garage's
+             - With consonant endings of [s], [z], [ch], [j], [sh], [zh]
+             - In ARPAbet: {S}, {Z}, {CH}, {JH}, {SH}, {ZH}
+             - These require a suffix of {IH0 Z}
+         2. Base words ending in vowels and voiced consonants:
+             - i.e. Fay's, Hugh's, Bob's, Ted's, Meg's, Sam's, Dean's, Claire's, Paul's, Bing's
+             - In ARPAbet: {IY0}, {EY1}, {UW1}, {B}, {D}, {G}, {M}, {N}, {R}, {L}, {NG}
+             - Vowels need a wildcard match of any numbered variant
+             - These require a suffix of {Z}
+         3. Base words ending in voiceless consonants:
+             - i.e. Hope's, Pat's, Clark's, Ruth's
+             - In ARPAbet: {P}, {T}, {K}, {TH}
+             - These require a suffix of {S}
+         """
+
+         # Method to return phoneme and increment stat
+         def _resolve(phoneme: str) -> str:
+             self.stat_resolves['possessives'] += 1
+             return phoneme
+
+         core = word[:-2]  # Get core word without possessive
+         ph = self._lookup(core, ph_format='list')  # find core word using recursive search
+         if ph is None:
+             return None  # Core word not found
+         # [Case 1]
+         # (ph is a list of phonemes, so suffixes are appended as list elements)
+         if ph[-1] in {'S', 'Z', 'CH', 'JH', 'SH', 'ZH'}:
+             ph += ['IH0', 'Z']
+             return _resolve(ph)
+         # [Case 2]
+         """
+         Valid for case 2:
+         'AA', 'AO', 'EY', 'OW', 'UW', 'AE', 'AW', 'EH', 'IH',
+         'OY', 'AH', 'AY', 'ER', 'IY', 'UH',
+         'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'
+         To simplify matching, we will check for the listed single-letter variants and 'NG'
+         and then check for any numbered variant
+         """
+         if ph[-1] in {'B', 'D', 'G', 'M', 'N', 'R', 'L', 'NG'} or ph[-1][-1].isdigit():
+             ph += ['Z']
+             return _resolve(ph)
+         # [Case 3]
+         if ph[-1] in ['P', 'T', 'K', 'TH']:
+             ph += ['S']
+             return _resolve(ph)
+
+         return None  # No match found
+
+     def auto_contractions(self, word: str) -> str | None:
+         """
+         Auto contracts form and finds phonemes
+         :param word:
+         :return:
+         """
+         """
+         Supported contractions:
+         - 'll
+         - 'd
+         """
+         # First, check if the word is a contraction
+         parts = word.split("\'")  # Split on [']
+         if len(parts) == 1 or parts[1] not in {'ll', 'd'}:
+             return None  # No contraction found
+         if len(parts) > 2:
+             self.stat_unexpected['contractions'].append(word)
+             return None  # More than 2 parts, can't be a contraction
+         # If initial check passes, register a hit
+         self.stat_hits['contractions'] += 1
+
+         # Get the core word
+         core = parts[0]
+         # Get the phoneme for the core word recursively
+         ph = self._lookup(core, ph_format='list')
+         if ph is None:
+             return None  # Core word not found
+         # Add the phoneme with the appropriate suffix
+         # (ph is a list of phonemes, so suffixes are appended as list elements)
+         if parts[1] == 'll':
+             ph += ['L']
+         elif parts[1] == 'd':
+             ph += ['D']
+         # Return the phoneme
+         self.stat_resolves['contractions'] += 1
+         return ph
+
+     def auto_hyphenated(self, word: str) -> str | None:
+         """
+         Splits hyphenated words and attempts to resolve components
+         :param word:
+         :return:
+         """
+         # First, check if the word is a hyphenated word
+         if '-' not in word:
+             return None  # No hyphen found
+         # If initial check passes, register a hit
+         self.stat_hits['hyphenated'] += 1
+         # Split the word into parts
+         parts = word.split('-')
+         # Get the phonemes for each part
+         ph = []
+         for part in parts:
+             ph_part = self._lookup(part, ph_format='sds')
+             if ph_part is None:
+                 return None  # Part not found
+             ph.append(ph_part)
+         # Join the phonemes
+         ph = ' '.join(ph)
+         # Return the phoneme
+         self.stat_resolves['hyphenated'] += 1
+         return ph
+
+     def auto_compound(self, word: str) -> str | None:
+         """
+         Splits compound words and attempts to resolve components
+         :param word:
+         :return:
+         """
+         # Split word into parts
+         parts = self._segment(word)
+         if len(parts) == 1:
+             return None  # No compound found
+         # If initial check passes, register a hit
+         self.stat_hits['compound'] += 1
+         # Get the phonemes for each part
+         ph = []
+         for part in parts:
+             ph_part = self._lookup(part, ph_format='sds')
+             if ph_part is None:
+                 return None  # Part not found
+             ph.append(ph_part)
+         # Join the phonemes
+         ph = ' '.join(ph)
+         # Return the phoneme
+         self.stat_resolves['compound'] += 1
+         return ph
+
+     def auto_plural(self, word: str, pos: str = None) -> str | None:
+         """
+         Finds the singular form of a plural and attempts to resolve it separately.
+         Optionally a pos tag can be provided.
+         If no tag is provided, a single-word pos inference is used,
+         which is not ideal.
+         :param pos:
+         :param word:
+         :return:
+         """
+         # First, check if the word is a replaceable plural
+         # Needs to end in 's' or 'es'
+         if word[-1] != 's':
+             return None  # No plural found
+         # Now check if the word is a plural using pos
+         if pos is None:
+             pos = self._tag(word)
+         if pos is None or len(pos) == 0 or (pos[0] != 'NNS' and pos[0] != 'NNPS'):
+             return None  # No tag found
+         # If initial check passes, register a hit
+         self.stat_hits['plural'] += 1
+
+         """
+         Case 1:
+         > Word ends in 'oes'
+         > Remove the 'es' to get the singular
+         """
+         if len(word) > 3 and word[-3:] == 'oes':
+             singular = word[:-2]
+             # Look up the possessive form (since the pronunciation is the same)
+             ph = self.auto_possessives(singular + "'s")
+             if ph is not None:
+                 self.stat_resolves['plural'] += 1
+                 return ph  # Return the phoneme
+
+         """
+         Case 2:
+         > Word ends in 's'
+         > Remove the 's' to get the singular
+         """
+         if len(word) > 1 and word[-1] == 's':
+             singular = word[:-1]
+             # Look up the possessive form (since the pronunciation is the same)
+             ph = self.auto_possessives(singular + "'s")
+             if ph is not None:
+                 self.stat_resolves['plural'] += 1
+                 return ph  # Return the phoneme
+
+         # If no matches, return None
+         return None
+
+     def auto_stem(self, word: str) -> str | None:
+         """
+         Attempts to resolve using the root stem of a word.
+         Supported modes:
+             - "ing"
+             - "ingly"
+             - "ly"
+         :param word:
+         :return:
+         """
+
+         # noinspection SpellCheckingInspection
+         """
+         'ly' has no special rules, always add phoneme 'L IY0'
+
+         'ing' relevant rules:
+
+         > If the original verb ended in [e], remove it and add [ing]
+             - i.e. take -> taking, make -> making
+             - We will search once with the original verb, and once with [e] added
+             - 1st attempt: tak, mak
+             - 2nd attempt: take, make
+
+         > If the input word has a repeated consonant before [ing], it's likely that
+           the original verb has only 1 of the consonants
+             - i.e. running -> run, stopping -> stop
+             - We will search for repeated consonants, and perform 2 attempts:
+             - 1st attempt: without the repeated consonant (run, stop)
+             - 2nd attempt: with the repeated consonant (runn, stopp)
+         """
+         # Discontinue if word is too short
+         if len(word) < 3 or (not word.endswith('ly') and not word.endswith('ing')):
+             return None
+         # Register a hit
+         self.stat_hits['stem'] += 1
+
+         # For the ly case
+         if word.endswith('ly'):
+             # Get the root word
+             root = word[:-2]
+             # Recursively get the root
+             ph_root = self._lookup(root, ph_format='sds')
+             # If it does not exist, return None
+             if ph_root is None:
+                 return None
+             ph_ly = 'L IY0'
+             ph_joined = ' '.join([ph_root, ph_ly])
+             self.stat_resolves['stem'] += 1
+             return ph_joined
+
+         # For the ing case
+         if word.endswith('ing'):
+             # Get the root word
+             root = word[:-3]
+             # Recursively get the root
+             ph_root = self._lookup(root, ph_format='sds')
+             # If it does not exist, return None
+             if ph_root is None:
+                 return None
+             ph_ing = 'IH0 NG'
+             ph_joined = ' '.join([ph_root, ph_ing])
+             self.stat_resolves['stem'] += 1
+             return ph_joined
+
+     def auto_component(self, word: str) -> str | None:
+         """
+         Searches for target word as component of a larger word
+         :param word:
+         :return:
+         """
+
+         """
+         This processing step checks for words as a component of a larger word
+         - i.e. 'synth' is not in the cmu dictionary
+         - Stage 1: We will search for any word beginning with 'synth' (10 matches)
+             - This is because most unseen short words are likely shortened versions
+             - We will split
+         - Stage 2: Search for any word containing 'synth' (13 matches)
+         """
+         raise NotImplementedError
+
+     def auto_compound_l2(self, word: str, recursive: bool = True) -> str | None:
+         """
+         Searches for target word as a compound word.
+         > Does not use n-gram splitting like auto_compound()
+         > Splits words manually into every possible combination
+         > Returns the match with the highest length of both words
+         :param recursive: True to enable recursive lookups, otherwise only use base CMU dictionary
+         :param word:
+         :return:
+         """
+         # Word must be fully alphabetic
+         if not word.isalpha() or len(word) < 3:
+             return None
+         self.stat_hits['compound_l2'] += 1  # Register hit
+
+         # Define lookup mode
+         def _lu(search_word: str) -> str | None:
+             if recursive:
+                 return self._lookup(search_word, ph_format='sds')
+             else:
+                 return self._cmu_get(search_word)
+
+         # Check if the last part is a single character
+         # And that it is repeated in the last char of the first part
+         # This is likely silent so remove it
+         # i.e. 'Derakk' -> 'Derak'
+         # If the word contains a repeated consonant at the end, remove it
+         # First check repeated last 2 letters
+         if word[-2:][0] == word[-2:][1]:
+             # Remove the last char from the word
+             word = word[:-1]
+
+         # Holds all matches as tuples
+         # (len1, len2, p1, p2, ph1, ph2)
+         matches = []
+
+         # Splits the word into every possible combination
+         for i in range(1, len(word)):
+             p1 = word[:i]
+             p2 = word[i:]
+             # Looks up both words
+             ph1 = _lu(p1)
+             if ph1 is None:
+                 continue  # Skip if not found
+             ph2 = _lu(p2)
+             if ph2 is None:
+                 continue  # Skip if not found
+             # If both words exist, add to list as tuple
+             matches.append((len(p1), len(p2), p1, p2, ph1, ph2))
+
+         # Pick the match with the highest length of both words
+         if len(matches) == 0:
+             return None
+         else:
+             # Sort by the minimum of len1 and len2
+             matches.sort(key=lambda x: min(x[0], x[1]))
+             # Get the highest minimum length match
+             match = matches[-1]
+             # Return the full joined match
+             self.stat_resolves['compound_l2'] += 1  # Register resolve
+             return match[4] + ' ' + match[5]
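
A hedged sketch of how these processors are reached in practice. CMUDictExt (added in this commit but not shown in this hunk) exposes its Processor as the attribute p, as utils/parser.py below relies on:

    from h2p_parser.cmudictext import CMUDictExt

    cde = CMUDictExt()
    # Sibilant ending -> 'IH0 Z' suffix per Case 1 above
    print(cde.p.auto_possessives("rose's"))
    # Both halves must resolve for a hyphenated word
    print(cde.p.auto_hyphenated('well-read'))
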
resources/app/python/xvapitch/text/h2p_parser/symbols.py ADDED
@@ -0,0 +1,82 @@
+ # Holds symbols for graphemes, phonemes, and pos-tags.
+ # noinspection SpellCheckingInspection,GrazieInspection
+ """
+ POS tag list:
+
+ CC    coordinating conjunction
+ CD    cardinal digit
+ DT    determiner
+ EX    existential there ("there is" -> "there exists")
+ FW    foreign word
+ IN    preposition/subordinating conjunction
+ JJ    adjective ('big')
+ JJR   adjective, comparative ('bigger')
+ JJS   adjective, superlative ('biggest')
+ LS    list marker ("1)", "2)", "3)")
+ MD    modal ('could', 'will')
+ NN    noun, singular
+ NNS   noun, plural
+ NNP   proper noun, singular ('Harrison')
+ NNPS  proper noun, plural ('Americans')
+ PDT   predeterminer ('all' in 'all the kids')
+ POS   possessive ending (parent's)
+ PRP   personal pronoun (I, he, she)
+ PRP$  possessive pronoun (my, his, hers)
+ RB    adverb ('very', 'silently')
+ RBR   adverb, comparative ('better')
+ RBS   adverb, superlative ('best')
+ RP    particle ('give up')
+ TO    to ("go 'to' the store.")
+ UH    interjection ("errrrrrrrm")
+ VB    verb, base form ('take')
+ VBD   verb, past tense ('took')
+ VBG   verb, gerund/present participle ('taking')
+ VBN   verb, past participle ('taken')
+ VBP   verb, sing. present, non-3d ('take')
+ VBZ   verb, 3rd person sing. present ('takes')
+ WDT   wh-determiner ('which')
+ WP    wh-pronoun ('who', 'what')
+ WP$   possessive wh-pronoun ('whose')
+ WRB   wh-adverb ('where', 'when')
+ """
+
+ from __future__ import annotations
+
+ # noinspection SpellCheckingInspection,GrazieInspection
+ graphemes = list("abcdefghijklmnopqrstuvwxyz")
+ phonemes = ['AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+             'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
+             'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH',
+             'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
+             'OW0', 'OW1', 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+             'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
+ pos_tags = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS',
+             'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH',
+             'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
+ pos_type_tags = ['VERB', 'NOUN', 'PRON', 'ADJ', 'ADV']
+ pos_type_short_tags = ['V', 'N', 'P', 'A', 'R']
+ pos_type_form_dict = {'V': 'VERB', 'N': 'NOUN', 'P': 'PRON', 'A': 'ADJ', 'R': 'ADV'}
+ graphemes_set = set(graphemes)
+ phonemes_set = set(phonemes)
+ pos_tags_set = set(pos_tags)
+ pos_type_tags_set = set(pos_type_tags)
+ pos_type_short_tags_set = set(pos_type_short_tags)
+ punctuation = {'.', ',', ':', ';', '?', '!', '-', '_', '\'', '\"', '`', '~', '@', '#', '$'}
+ consonants = {'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG', 'P', 'R',
+               'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'}
+
+
+ # Method to convert from short type tags to full type tags.
+ def to_full_type_tag(short_type_tag: str) -> str | None:
+     if short_type_tag == 'V':
+         return 'VERB'
+     elif short_type_tag == 'N':
+         return 'NOUN'
+     elif short_type_tag == 'P':
+         return 'PRON'
+     elif short_type_tag == 'A':
+         return 'ADJ'
+     elif short_type_tag == 'R':
+         return 'ADV'
+     else:
+         return None
resources/app/python/xvapitch/text/h2p_parser/text/__init__.py ADDED
File without changes
resources/app/python/xvapitch/text/h2p_parser/text/numbers.py ADDED
@@ -0,0 +1,166 @@
+ # Provides parsing of numbers to text
+ """
+ This module provides parsing of numeric types in English to text.
+ Modified from https://github.com/keithito/tacotron
+ """
+
+ import inflect
+ import re
+
+ _magnitudes = ['trillion', 'billion', 'million', 'thousand', 'hundred', 'm', 'b', 't']
+ _magnitudes_key = {'m': 'million', 'b': 'billion', 't': 'trillion'}
+ _measurements = '(f|c|k|d|m|km|ft)'
+ _measurements_key = {'f': 'fahrenheit',
+                      'c': 'celsius',
+                      'k': 'thousand',
+                      'm': 'meters',
+                      'km': 'kilometers',
+                      'ft': 'feet'}
+ _currency_key = {'$': 'dollar', '£': 'pound', '€': 'euro', '₩': 'won'}
+ _inflect = inflect.engine()
+ _comma_number_re = re.compile(r'([0-9][0-9,]+[0-9])')
+ _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
+ _currency_re = re.compile(r'([$€£₩])([0-9.,]*[0-9]+)(?:[ ]?({})(?=[^a-zA-Z]|$))?'.format("|".join(_magnitudes)),
+                           re.IGNORECASE)
+ _measurement_re = re.compile(r'([0-9.,]*[0-9]+(\s)?{}\b)'.format(_measurements), re.IGNORECASE)
+ _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+ _range_re = re.compile(r'(?<=[0-9])+(-)(?=[0-9])+.*?')
+ _roman_re = re.compile(r'\b(?=[MDCLXVI]+\b)M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{2,3})\b')  # avoid I
+ _multiply_re = re.compile(r'(\b[0-9]+)(x)([0-9]+)')
+ _number_re = re.compile(r"[0-9]+'s|[0-9]+s|[0-9]+")
+
+
+ def _remove_commas(m):
+     return m.group(1).replace(',', '')
+
+
+ def _expand_decimal_point(m):
+     return m.group(1).replace('.', ' point ')
+
+
+ def _expand_currency(m):
+     currency = _currency_key[m.group(1)]
+     quantity = m.group(2)
+     magnitude = m.group(3)
+
+     # remove commas from quantity to be able to convert to numerical
+     quantity = quantity.replace(',', '')
+
+     # check for million, billion, etc...
+     if magnitude is not None and magnitude.lower() in _magnitudes:
+         if len(magnitude) == 1:
+             magnitude = _magnitudes_key[magnitude.lower()]
+         return "{} {} {}".format(_expand_hundreds(quantity), magnitude, currency + 's')
+
+     parts = quantity.split('.')
+     if len(parts) > 2:
+         return quantity + " " + currency + "s"  # Unexpected format
+
+     dollars = int(parts[0]) if parts[0] else 0
+
+     cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+     if dollars and cents:
+         dollar_unit = currency if dollars == 1 else currency + 's'
+         cent_unit = 'cent' if cents == 1 else 'cents'
+         return "{} {}, {} {}".format(
+             _expand_hundreds(dollars), dollar_unit,
+             _inflect.number_to_words(cents), cent_unit)
+     elif dollars:
+         dollar_unit = currency if dollars == 1 else currency + 's'
+         return "{} {}".format(_expand_hundreds(dollars), dollar_unit)
+     elif cents:
+         cent_unit = 'cent' if cents == 1 else 'cents'
+         return "{} {}".format(_inflect.number_to_words(cents), cent_unit)
+     else:
+         return 'zero' + ' ' + currency + 's'
+
+
+ def _expand_hundreds(text):
+     number = float(text)
+     if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
+         return _inflect.number_to_words(int(number / 100)) + " hundred"
+     else:
+         return _inflect.number_to_words(text)
+
+
+ def _expand_ordinal(m):
+     return _inflect.number_to_words(m.group(0))
+
+
+ def _expand_measurement(m):
+     _, number, measurement = re.split(r'(\d+(?:\.\d+)?)', m.group(0))
+     number = _inflect.number_to_words(number)
+     measurement = "".join(measurement.split())
+     measurement = _measurements_key[measurement.lower()]
+     # if measurement is plural, and number is singular, remove the 's'
+     if number == "one" and str.endswith(measurement, "s"):
+         # Remove the 's' from the end of the measurement
+         measurement = measurement[:-1]
+     return "{} {}".format(number, measurement)
+
+
+ def _expand_range(m):
+     return ' to '
+
+
+ def _expand_multiply(m):
+     left = m.group(1)
+     right = m.group(3)
+     return "{} by {}".format(left, right)
+
+
+ def _expand_roman(m):
+     # from https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
+     roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+     result = 0
+     num = m.group(0)
+     for i, c in enumerate(num):
+         if (i + 1) == len(num) or roman_numerals[c] >= roman_numerals[num[i + 1]]:
+             result += roman_numerals[c]
+         else:
+             result -= roman_numerals[c]
+     return str(result)
+
+
+ def _expand_number(m):
+     _, number, suffix = re.split(r"(\d+(?:'?\d+)?)", m.group(0))
+     number = int(number)
+     if 1000 < number < 10000 and (number % 100 == 0) and (number % 1000 != 0):
+         text = _inflect.number_to_words(number // 100) + " hundred"
+     elif 1000 < number < 3000:
+         if number == 2000:
+             text = 'two thousand'
+         elif 2000 < number < 2010:
+             text = 'two thousand ' + _inflect.number_to_words(number % 100)
+         elif number % 100 == 0:
+             text = _inflect.number_to_words(number // 100) + ' hundred'
+         else:
+             number = _inflect.number_to_words(number, andword='', zero='oh', group=2).replace(', ', ' ')
+             number = re.sub(r'-', ' ', number)
+             text = number
+     else:
+         number = _inflect.number_to_words(number, andword='and')
+         number = re.sub(r'-', ' ', number)
+         number = re.sub(r',', '', number)
+         text = number
+
+     if suffix in ("'s", "s"):
+         if text[-1] == 'y':
+             text = text[:-1] + 'ies'
+         else:
+             text = text + suffix
+
+     return text
+
+
+ def normalize_numbers(text):
+     text = re.sub(_comma_number_re, _remove_commas, text)
+     text = re.sub(_currency_re, _expand_currency, text)
+     text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+     text = re.sub(_ordinal_re, _expand_ordinal, text)
+     # text = re.sub(_range_re, _expand_range, text)
+     text = re.sub(_measurement_re, _expand_measurement, text)
+     text = re.sub(_roman_re, _expand_roman, text)
+     text = re.sub(_multiply_re, _expand_multiply, text)
+     text = re.sub(_number_re, _expand_number, text)
+     return text
File without changes
resources/app/python/xvapitch/text/h2p_parser/utils/converter.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Converts dictionary files
2
+ import json
3
+ import os
4
+
5
+ from .. import symbols
6
+ from .. import format_ph as ph
7
+ from tqdm import tqdm
8
+
9
+
10
+ def from_binary_delim(path, delimiter) -> dict:
11
+ # Converts a delimited binary state heteronym look-up dictionary to a dict format
12
+ # Expected format: WORD|(Space Seperated Phonemes Case)|(Space Seperated Phonemes Default)|(Case)
13
+ # Example: "REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V"
14
+ # Hashtag comments are allowed but only at the start of a file
15
+
16
+ # Import file
17
+ result_dict = {}
18
+ num_lines = sum(1 for line in open(path, 'r'))
19
+ with open(path, 'r') as f:
20
+ skipped_comments = False
21
+ for line in tqdm(f, total=num_lines):
22
+ # Skip comments
23
+ if not skipped_comments:
24
+ if line.startswith('#') or line == '\n':
25
+ continue
26
+ else:
27
+ skipped_comments = True
28
+ # Skip empty or newline lines
29
+ if line.strip() == '' or line.strip() == '\n':
30
+ continue
31
+ # Parse line using passed delimiter
32
+ tokens = line.strip().split(delimiter)
33
+ # Check for correct number of tokens
34
+ if len(tokens) != 4:
35
+ raise ValueError('Invalid number of tokens in line: ' + line)
36
+ # Get word (token 0) and check validity (no spaces)
37
+ word = tokens[0].lower()
38
+ if ' ' in word:
39
+ raise ValueError('Invalid word in line: ' + line)
40
+ # Get phonemes and check validity (alphanumeric)
41
+ ph_case = tokens[1]
42
+ ph_default = tokens[2]
43
+ if not ph_case.replace(' ', '').isalnum() or not ph_default.replace(' ', '').isalnum():
44
+ raise ValueError('Invalid phonemes in line: ' + line)
45
+ # Get case (token 3) and check validity (alphanumeric)
46
+ case = tokens[3]
47
+ if not case.isalnum():
48
+ raise ValueError('Invalid case in line: ' + line)
49
+ # Check if case is a full case or full type case
50
+ if case in symbols.pos_tags_set or case in symbols.pos_type_tags_set:
51
+ # Add to dictionary directly
52
+ # Build sub-dictionary for each case
53
+ sub_dict = result_dict.get(word, {})
54
+ sub_dict[case] = ph.to_sds(ph_case)
55
+ sub_dict['DEFAULT'] = ph.to_sds(ph_default)
56
+ result_dict[word] = sub_dict
57
+ # Check if case is a short type case
58
+ elif case in symbols.pos_type_short_tags_set:
59
+ # Need to convert to full type case
60
+ sub_dict = result_dict.get(word, {})
61
+ case_short = symbols.pos_type_form_dict[case]
62
+ sub_dict[case_short] = ph.to_sds(ph_case)
63
+ sub_dict['DEFAULT'] = ph.to_sds(ph_default)
64
+ result_dict[word] = sub_dict
65
+ else:
66
+ raise ValueError('Invalid case in line: ' + line)
67
+ return result_dict
68
+
69
+
70
+ # Method to write a dict to a json file
71
+ def to_json(path, dict_to_write):
72
+ # Writes a dictionary to a json file
73
+ with open(path, 'w') as f:
74
+ json.dump(dict_to_write, f, indent=4, sort_keys=True)
75
+
76
+
77
+ # Combined method to convert binary delimited files to json
78
+ def bin_delim_to_json(path, output_path, delimiter):
79
+ to_json(output_path, from_binary_delim(path, delimiter))
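
A minimal usage sketch; hets.txt and hets.json are hypothetical paths, and the input follows the delimited format documented in from_binary_delim() above:

    from h2p_parser.utils import converter

    # Input line example: REJECT|R IH0 JH EH1 K T|R IY1 JH EH0 K T|V
    converter.bin_delim_to_json('hets.txt', 'hets.json', '|')
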
resources/app/python/xvapitch/text/h2p_parser/utils/parser.py ADDED
@@ -0,0 +1,133 @@
+ # Parses annotation files for conversion of sentences to phonemes
+ from __future__ import annotations
+ from h2p_parser import cmudictext
+ from h2p_parser.filter import filter_text
+ from h2p_parser.text.numbers import normalize_numbers
+ from h2p_parser.symbols import punctuation
+
+ # Reads a file into a list of lines
+ from tqdm import tqdm
+
+
+ def read_file(file_name, delimiter) -> list:
+     with open(file_name, 'r', encoding="utf-8") as f:
+         result = []
+         for line in f:
+             line = line.split(delimiter)
+             # Take the second element
+             result.append(line[1].lower())
+         return result
+
+ # Method that checks if a single line is resolvable
+
+
+ # Checks a list of lines for unresolvable words
+ # Returns a ParseResult describing resolved and unresolvable lines/words
+ def check_lines(lines: list) -> ParseResult:
+     cde = cmudictext.CMUDictExt()
+     # Holds result
+     result = ParseResult()
+     # Loop with tqdm
+     for line in tqdm(lines, desc='Checking lines'):
+         # Add
+         result.all_lines.append(line)
+         result.lines.add(line)
+         # If line contains het, add to result
+         # (contains_het returns a (bool, list) tuple, so check the bool)
+         if cde.h2p.contains_het(line)[0]:
+             result.all_lines_cont_het.append(line)
+         # Filter the line
+         f_line = filter_text(line)
+         # Number converter
+         f_line = normalize_numbers(f_line)
+         # Tokenize
+         tokens = cde.h2p.tokenize(f_line)
+         for word in tokens:
+             # Skip word if punctuation
+             if word in punctuation:
+                 continue
+             # Add word to result
+             result.all_words.append(word)
+             result.words.add(word)
+             # Check if word is resolvable
+             h2p_res = cde.h2p.contains_het(word)[0]
+             cmu_res = cde.dict.get(word) is not None
+             fet_res = cde.lookup(word) is not None
+             if not h2p_res and not cmu_res and not fet_res:
+                 # If word ends in "'s", remove it and add the base word
+                 if word.endswith("'s"):
+                     word = word[:-2]
+                 result.unres_all_lines.append(line)
+                 result.unres_all_words.append(word)
+                 result.unres_lines.add(line)
+                 result.unres_words.add(word)
+             elif h2p_res:
+                 result.n_words_res += 1
+                 result.n_words_het += 1
+             elif cmu_res:
+                 result.n_words_res += 1
+                 result.n_words_cmu += 1
+             elif fet_res:
+                 result.n_words_res += 1
+                 result.n_words_fet += 1
+
+     # Also pass stats
+     result.ft_stats = cde.p.stat_resolves
+
+     return result
+
+
+ # Class to hold the result of a parse
+ class ParseResult:
+     def __init__(self):
+         self.all_lines = []
+         self.all_lines_cont_het = []
+         self.unres_all_lines = []
+         self.lines = set()
+         self.unres_lines = set()
+         # Words
+         self.all_words = []
+         self.unres_all_words = []
+         self.words = set()
+         self.unres_words = set()
+         # Numerical stats
+         self.n_words_res = 0  # Number of total resolved words
+         self.n_words_cmu = 0  # Resolved words from CMU
+         self.n_words_fet = 0  # Resolved words from Features
+         self.n_words_het = 0  # Resolved words from H2p
+         # Stats from cmudictext
+         self.ft_stats = None
+
+     # Get percentage of unique lines covered
+     def line_unique_coverage(self) -> float:
+         dec = 1 - len(self.unres_lines) / len(self.lines)
+         return round(dec * 100, 2)
+
+     # Get percentage of unique words covered
+     def word_unique_coverage(self) -> float:
+         dec = 1 - len(self.unres_words) / len(self.words)
+         return round(dec * 100, 2)
+
+     # Get percentage of lines covered (All)
+     def line_coverage(self) -> float:
+         dec = 1 - len(self.unres_all_lines) / len(self.all_lines)
+         return round(dec * 100, 2)
+
+     # Get percentage of words covered (All)
+     def word_coverage(self) -> float:
+         dec = 1 - len(self.unres_all_words) / len(self.all_words)
+         return round(dec * 100, 2)
+
+     # Get percentage of lines containing heteronyms
+     def percent_line_het(self) -> float:
+         dec = len(self.all_lines_cont_het) / len(self.all_lines)
+         return round(dec * 100, 2)
+
+     # Get percentage of words resolved by H2p
+     def percent_word_h2p(self) -> float:
+         dec = self.n_words_het / self.n_words_res
+         return round(dec * 100, 2)
+
+     # Get percentage of words resolved by CMU
+     def percent_word_cmu(self) -> float:
+         dec = self.n_words_cmu / self.n_words_res
+         return round(dec * 100, 2)
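
A short sketch of running the checker above on a few lines (read_file expects delimited annotation files, for example LJSpeech-style metadata):

    from h2p_parser.utils.parser import check_lines

    result = check_lines(['I read the book.', 'Please record the record.'])
    print(result.word_coverage(), '% of words resolved')
    print(result.unres_words)  # any words none of the lookup stages could resolve
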