Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,620 Bytes
2080fde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# Parses annotation files for conversion of sentences to phonemes
from __future__ import annotations
from h2p_parser import cmudictext
from h2p_parser.filter import filter_text
from h2p_parser.text.numbers import normalize_numbers
from h2p_parser.symbols import punctuation
# Reads a file into a list of lines
from tqdm import tqdm
def read_file(file_name, delimiter) -> list:
    """Read the second delimited field of every line in a text file.

    Args:
        file_name: Path of the file to read; it is opened as UTF-8 text.
        delimiter: Separator handed to ``str.split`` for each line.

    Returns:
        The lower-cased second field of each line, in file order. Apart
        from lower-casing the field is returned unmodified, so it keeps the
        line terminator when it is the last field on its line. Lines that
        have no second field (blank lines, lines without the delimiter)
        are skipped.
    """
    result = []
    with open(file_name, 'r', encoding="utf-8") as f:
        for line in f:
            fields = line.split(delimiter)
            # A blank or malformed line has no second field; skip it rather
            # than abort the whole read with an IndexError.
            if len(fields) < 2:
                continue
            # Take the second element
            result.append(fields[1].lower())
    return result
# Checks a list of lines for unresolvable words
# Every non-punctuation word of every line is tested for resolvability
# Returns a ParseResult holding the collected lines, words and statistics
def check_lines(lines: list) -> ParseResult:
    """Check every word of every line and collect resolution statistics.

    Each line is filtered with ``filter_text``, passed through
    ``normalize_numbers`` and tokenized with ``cde.h2p.tokenize``. Tokens
    found in ``punctuation`` are ignored. Every remaining word is tried
    against three sources, credited in this order: ``cde.h2p.contains_het``,
    ``cde.dict`` and ``cde.lookup``. A word that none of them resolves is
    recorded as unresolved (with a trailing "'s" removed).

    Args:
        lines: The text lines to check.

    Returns:
        A ``ParseResult`` holding all lines and words seen, the unresolved
        lines and words, the per-source counters and ``cde.p.stat_resolves``.
    """
    cde = cmudictext.CMUDictExt()
    # Holds result
    result = ParseResult()
    # Loop with tqdm
    for line in tqdm(lines, desc='Checking lines'):
        # Add
        result.all_lines.append(line)
        result.lines.add(line)
        # If line contains het, add to result
        if cde.h2p.contains_het(line):
            result.all_lines_cont_het.append(line)
        # Filter the line
        f_line = filter_text(line)
        # Number converter
        f_line = normalize_numbers(f_line)
        # Tokenize
        tokens = cde.h2p.tokenize(f_line)
        # Set once any word of this line turns out to be unresolvable
        line_unresolved = False
        for word in tokens:
            # Skip word if punctuation
            if word in punctuation:
                continue
            # Add word to result
            result.all_words.append(word)
            result.words.add(word)
            # Check if word is resolvable; all three sources are always
            # queried, exactly as before, so any bookkeeping they do
            # internally is unaffected.
            h2p_res = cde.h2p.contains_het(word)
            cmu_res = cde.dict.get(word) is not None
            fet_res = cde.lookup(word) is not None
            if h2p_res:
                result.n_words_res += 1
                result.n_words_het += 1
            elif cmu_res:
                result.n_words_res += 1
                result.n_words_cmu += 1
            elif fet_res:
                result.n_words_res += 1
                result.n_words_fet += 1
            else:
                # If word ends in "'s", remove it and add the base word
                if word.endswith("'s"):
                    word = word[:-2]
                result.unres_all_words.append(word)
                result.unres_words.add(word)
                line_unresolved = True
        # Record the line once per occurrence, not once per unresolved word;
        # otherwise unres_all_lines can outgrow all_lines and the overall
        # line coverage becomes skewed or even negative.
        if line_unresolved:
            result.unres_all_lines.append(line)
            result.unres_lines.add(line)
    # Also pass stats
    result.ft_stats = cde.p.stat_resolves
    return result
# Class to hold the result of a parse
class ParseResult:
    """Hold the lines, words and counters collected while checking a data set.

    The percentage helpers return values rounded to two decimals. When
    nothing has been collected yet they return a neutral value instead of
    raising ZeroDivisionError: coverage is 100.0 (nothing is unresolved)
    and source shares are 0.0.
    """

    def __init__(self):
        # Lines
        self.all_lines = []  # Every line seen, duplicates kept
        self.all_lines_cont_het = []  # Lines flagged as containing a heteronym
        self.unres_all_lines = []  # Lines with unresolved words, duplicates kept
        self.lines = set()  # Unique lines seen
        self.unres_lines = set()  # Unique lines with unresolved words
        # Words
        self.all_words = []  # Every word seen, duplicates kept
        self.unres_all_words = []  # Unresolved words, duplicates kept
        self.words = set()  # Unique words seen
        self.unres_words = set()  # Unique unresolved words
        # Numerical stats
        self.n_words_res = 0  # Number of total resolved words
        self.n_words_cmu = 0  # Resolved words from CMU
        self.n_words_fet = 0  # Resolved words from Features
        self.n_words_het = 0  # Resolved words from H2p
        # Stats from cmudictext
        self.ft_stats = None

    @staticmethod
    def _coverage(unresolved, total) -> float:
        """Return the percentage of ``total`` not accounted for by ``unresolved``.

        Both arguments are sized collections; an empty ``total`` counts as
        fully covered.
        """
        if not total:
            return 100.0
        return round((1 - len(unresolved) / len(total)) * 100, 2)

    @staticmethod
    def _share(part, whole) -> float:
        """Return ``part`` as a percentage of ``whole``, or 0.0 if ``whole`` is 0."""
        if not whole:
            return 0.0
        return round(part / whole * 100, 2)

    # Get percentage of lines covered
    def line_unique_coverage(self) -> float:
        """Return the percentage of unique lines without unresolved words."""
        return self._coverage(self.unres_lines, self.lines)

    # Get percentage of words covered
    def word_unique_coverage(self) -> float:
        """Return the percentage of unique words that are not unresolved."""
        return self._coverage(self.unres_words, self.words)

    # Get percentage of lines covered (All)
    def line_coverage(self) -> float:
        """Return the percentage of all lines without unresolved words."""
        return self._coverage(self.unres_all_lines, self.all_lines)

    # Get percentage of words covered (All)
    def word_coverage(self) -> float:
        """Return the percentage of all words that are not unresolved."""
        return self._coverage(self.unres_all_words, self.all_words)

    # Get percentage of heteronyms containing lines
    def percent_line_het(self) -> float:
        """Return the percentage of all lines flagged as containing a heteronym."""
        return self._share(len(self.all_lines_cont_het), len(self.all_lines))

    # Get percentage of words resolved by H2p
    def percent_word_h2p(self) -> float:
        """Return the percentage of resolved words that were resolved by H2p."""
        return self._share(self.n_words_het, self.n_words_res)

    # Get percentage of words resolved by CMU
    def percent_word_cmu(self) -> float:
        """Return the percentage of resolved words that were resolved by CMU."""
        return self._share(self.n_words_cmu, self.n_words_res)
|