Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
# Parses annotation files for conversion of sentences to phonemes | |
from __future__ import annotations | |
from h2p_parser import cmudictext | |
from h2p_parser.filter import filter_text | |
from h2p_parser.text.numbers import normalize_numbers | |
from h2p_parser.symbols import punctuation | |
# Reads a file into a list of lines | |
from tqdm import tqdm | |
def read_file(file_name, delimiter) -> list:
    """Read a delimited annotation file and return its second column.

    Each non-blank line is split on ``delimiter`` and the second field is
    lower-cased and collected.

    Args:
        file_name: Path of the UTF-8 encoded file to read.
        delimiter: Separator passed to ``str.split`` for every line.

    Returns:
        A list with the lower-cased second field of every non-blank line,
        in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    result = []
    with open(file_name, 'r', encoding="utf-8") as f:
        for line_no, raw in enumerate(f, start=1):
            # Drop the line terminator so that a second field which is also
            # the last field does not keep a trailing newline.
            raw = raw.rstrip("\n")
            # Blank lines (typically a trailing one) carry no annotation.
            if not raw.strip():
                continue
            fields = raw.split(delimiter)
            if len(fields) < 2:
                raise IndexError(
                    f"{file_name}:{line_no}: expected at least two fields "
                    f"separated by {delimiter!r}"
                )
            # Take the second element
            result.append(fields[1].lower())
    return result
# Checks every word of every line for resolvability
# Checks a list of lines for unresolvable words
# Returns a ParseResult holding all lines and words seen, plus the unresolvable ones
def check_lines(lines: list) -> ParseResult:
    """Check every word of every line and collect the outcome.

    Each line is recorded, filtered, passed through the number normalizer
    and tokenized. Every non-punctuation token is then tested against the
    heteronym check, the CMU dictionary and the extended lookup, in that
    order of precedence for the per-source counters.

    Args:
        lines: The lines to check.

    Returns:
        A ParseResult with all lines and words seen, the unresolved ones,
        the per-source resolution counters and the resolver statistics.
    """
    resolver = cmudictext.CMUDictExt()
    report = ParseResult()

    # Iterate with a tqdm progress bar
    for text in tqdm(lines, desc='Checking lines'):
        # Keep the line both in order (duplicates kept) and as a unique entry
        report.all_lines.append(text)
        report.lines.add(text)

        # Remember lines that contain a heteronym
        if resolver.h2p.contains_het(text):
            report.all_lines_cont_het.append(text)

        # Filter, then normalize numbers, then tokenize
        tokens = resolver.h2p.tokenize(normalize_numbers(filter_text(text)))

        for token in tokens:
            # Punctuation is not a word
            if token in punctuation:
                continue

            report.all_words.append(token)
            report.words.add(token)

            # All three sources are queried for every word, even when an
            # earlier one already matched.
            # NOTE(review): presumably lookup() feeds the statistics read
            # from resolver.p.stat_resolves below - confirm before making
            # these calls conditional.
            is_het = resolver.h2p.contains_het(token)
            in_cmu = resolver.dict.get(token) is not None
            by_features = resolver.lookup(token) is not None

            if is_het:
                report.n_words_res += 1
                report.n_words_het += 1
            elif in_cmu:
                report.n_words_res += 1
                report.n_words_cmu += 1
            elif by_features:
                report.n_words_res += 1
                report.n_words_fet += 1
            else:
                # Report possessives by their base word
                base = token[:-2] if token.endswith("'s") else token
                # NOTE(review): the line is appended once per unresolved
                # word, so a line with several unresolved words appears
                # several times in unres_all_lines - confirm this is intended.
                report.unres_all_lines.append(text)
                report.unres_all_words.append(base)
                report.unres_lines.add(text)
                report.unres_words.add(base)

    # Also pass stats
    report.ft_stats = resolver.p.stat_resolves
    return report
# Class to hold the result of a parse | |
class ParseResult:
    """Holds the result of a parse and derives coverage percentages from it.

    All percentage methods return a value rounded to two decimals and
    return 0.0 instead of raising ZeroDivisionError when there is nothing
    to divide by (no lines, no words or no resolved words).
    """

    def __init__(self):
        # Lines
        self.all_lines = []           # Every line, duplicates kept
        self.all_lines_cont_het = []  # Lines containing a heteronym
        self.unres_all_lines = []     # Lines with unresolved words, duplicates kept
        self.lines = set()            # Unique lines
        self.unres_lines = set()      # Unique lines with unresolved words
        # Words
        self.all_words = []           # Every word, duplicates kept
        self.unres_all_words = []     # Unresolved words, duplicates kept
        self.words = set()            # Unique words
        self.unres_words = set()      # Unique unresolved words
        # Numerical stats
        self.n_words_res = 0  # Number of total resolved words
        self.n_words_cmu = 0  # Resolved words from CMU
        self.n_words_fet = 0  # Resolved words from Features
        self.n_words_het = 0  # Resolved words from H2p
        # Stats from cmudictext
        self.ft_stats = None

    @staticmethod
    def _coverage(unresolved: int, total: int) -> float:
        """Return the percentage of ``total`` that is not ``unresolved``."""
        if not total:
            # Nothing was checked, so there is nothing to report as covered
            return 0.0
        dec = 1 - unresolved / total
        return round(dec * 100, 2)

    @staticmethod
    def _percent(part: int, whole: int) -> float:
        """Return ``part`` as a percentage of ``whole``."""
        if not whole:
            return 0.0
        dec = part / whole
        return round(dec * 100, 2)

    # Get percentage of lines covered
    def line_unique_coverage(self) -> float:
        """Return the percentage of unique lines without unresolved words."""
        return self._coverage(len(self.unres_lines), len(self.lines))

    # Get percentage of words covered
    def word_unique_coverage(self) -> float:
        """Return the percentage of unique words that are not unresolved."""
        return self._coverage(len(self.unres_words), len(self.words))

    # Get percentage of lines covered (All)
    def line_coverage(self) -> float:
        """Return the coverage computed over the non-unique line lists.

        The value is ``1 - len(unres_all_lines) / len(all_lines)`` as a
        percentage, so it depends on how often a line was appended to
        ``unres_all_lines``.
        """
        return self._coverage(len(self.unres_all_lines), len(self.all_lines))

    # Get percentage of words covered (All)
    def word_coverage(self) -> float:
        """Return the coverage computed over the non-unique word lists."""
        return self._coverage(len(self.unres_all_words), len(self.all_words))

    # Get percentage of heteronyms containing lines
    def percent_line_het(self) -> float:
        """Return the percentage of all lines that contain a heteronym."""
        return self._percent(len(self.all_lines_cont_het), len(self.all_lines))

    # Get percentage of words resolved by H2p
    def percent_word_h2p(self) -> float:
        """Return the share of resolved words that were resolved by H2p."""
        return self._percent(self.n_words_het, self.n_words_res)

    # Get percentage of words resolved by CMU
    def percent_word_cmu(self) -> float:
        """Return the share of resolved words that were resolved by CMU."""
        return self._percent(self.n_words_cmu, self.n_words_res)