import os
import tempfile
import re
import librosa
import torch
import json
import numpy as np
from transformers import Wav2Vec2ForCTC, AutoProcessor
from huggingface_hub import hf_hub_download
from torchaudio.models.decoder import ctc_decoder
from utils.text_norm import text_normalize
from utils.lm import create_unigram_lm, maybe_generate_pseudo_bigram_arpa
uroman_dir = "uroman"
assert os.path.exists(uroman_dir)
UROMAN_PL = os.path.join(uroman_dir, "bin", "uroman.pl")
ASR_SAMPLING_RATE = 16_000
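# Default word-insertion and LM-weight scores for the CTC beam-search decoder, used when the caller does not override them.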
WORD_SCORE_DEFAULT_IF_LM = -0.18
WORD_SCORE_DEFAULT_IF_NOLM = -3.5
LM_SCORE_DEFAULT = 1.48
MODEL_ID = "mms-meta/mms-zeroshot-300m"
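# Load the zero-shot MMS checkpoint, its processor, and the decoder token list once at import time.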
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
token_file = hf_hub_download(
repo_id=MODEL_ID,
filename="tokens.txt",
)
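# Minimal accumulating logger; process() streams its text back to the caller alongside the running transcription.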
class MY_LOG:
def __init__(self):
self.text = "[START]"
def add(self, new_log, new_line=True):
self.text = self.text + ("\n" if new_line else " ") + new_log
self.text = self.text.strip()
return self.text
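# Return an error message if the supplied path is not a valid file path, otherwise None.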
def error_check_file(filepath):
if not isinstance(filepath, str):
return "Expected file to be of type 'str'. Instead got {}".format(
type(filepath)
)
if not os.path.exists(filepath):
return "Input file '{}' doesn't exists".format(type(filepath))
def norm_uroman(text):
text = text.lower()
text = text.replace("’", "'")
text = re.sub("([^a-z' ])", " ", text)
text = re.sub(" +", " ", text)
return text.strip()
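# Romanize words with the uroman Perl script and map each word to a space-separated character spelling terminated by the word-boundary token "|".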
def uromanize(words):
iso = "xxx"
with tempfile.NamedTemporaryFile() as tf, tempfile.NamedTemporaryFile() as tf2:
with open(tf.name, "w") as f:
f.write("\n".join(words))
cmd = f"perl " + UROMAN_PL
cmd += f" -l {iso} "
cmd += f" < {tf.name} > {tf2.name}"
os.system(cmd)
lexicon = {}
with open(tf2.name) as f:
for idx, line in enumerate(f):
if not line.strip():
continue
line = re.sub(r"\s+", "", norm_uroman(line)).strip()
lexicon[words[idx]] = " ".join(line) + " |"
return lexicon
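# When several words share the same romanized spelling, keep only the most frequent one (shortest on ties) so the decoder lexicon stays unambiguous.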
def filter_lexicon(lexicon, word_counts):
spelling_to_words = {}
for w, s in lexicon.items():
spelling_to_words.setdefault(s, [])
spelling_to_words[s].append(w)
lexicon = {}
for s, ws in spelling_to_words.items():
if len(ws) > 1:
# use the word with the highest count and the fewest additional characters
ws.sort(key=lambda w: (-word_counts[w], len(w)))
lexicon[ws[0]] = s
return lexicon
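# Count normalized word frequencies in the text file; the counts drive lexicon filtering and the optional unigram LM.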
def load_words(filepath):
words = {}
with open(filepath) as f:
lines = f.readlines()
num_sentences = len(lines)
all_sentences = " ".join([l.strip() for l in lines])
norm_all_sentences = text_normalize(all_sentences)
for w in norm_all_sentences.split():
words.setdefault(w, 0)
words[w] += 1
return words, num_sentences
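# End-to-end zero-shot ASR: run the MMS acoustic model on the audio, build a uroman lexicon
# (and optionally a unigram LM) from the words file, then beam-search decode.
# Implemented as a generator so intermediate (transcription, log) updates can be streamed to the caller.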
def process(
audio_data,
words_file,
lm_path=None,
wscore=None,
lmscore=None,
wscore_usedefault=True,
lmscore_usedefault=True,
autolm=True,
reference=None,
):
transcription, logs = "", MY_LOG()
if not audio_data or not words_file:
yield "ERROR: Empty audio data or words file", logs.text
return
if isinstance(audio_data, tuple):
# microphone
sr, audio_samples = audio_data
audio_samples = (audio_samples / 32768.0).astype(float)
if sr != ASR_SAMPLING_RATE:
audio_samples = librosa.resample(
audio_samples, orig_sr=sr, target_sr=ASR_SAMPLING_RATE
)
else:
# file upload
assert isinstance(audio_data, str)
audio_samples = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)[0]
yield transcription, logs.add(f"Number of audio samples: {len(audio_samples)}")
inputs = processor(
audio_samples, sampling_rate=ASR_SAMPLING_RATE, return_tensors="pt"
)
# set device
if torch.cuda.is_available():
device = torch.device("cuda")
elif (
hasattr(torch.backends, "mps")
and torch.backends.mps.is_available()
and torch.backends.mps.is_built()
):
device = torch.device("mps")
else:
device = torch.device("cpu")
#device = torch.device("cpu")
model.to(device)
inputs = inputs.to(device)
yield transcription, logs.add(f"Using device: {device}")
with torch.no_grad():
outputs = model(**inputs).logits
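# outputs holds the CTC emissions of shape (batch, time, vocab), consumed by the beam-search decoder below.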
# Setup lexicon and decoder
yield transcription, logs.add(f"Loading words....")
try:
word_counts, num_sentences = load_words(words_file)
except Exception as e:
yield f"ERROR: Loading words failed '{str(e)}'", logs.text
return
yield transcription, logs.add(
f"Loaded {len(word_counts)} words from {num_sentences} lines.\nPreparing lexicon...."
)
try:
lexicon = uromanize(list(word_counts.keys()))
except Exception as e:
yield f"ERROR: Creating lexicon failed '{str(e)}'", logs.text
return
yield transcription, logs.add(f"Leixcon size: {len(lexicon)}")
# Input could be sentences OR a list of words. Check if at least one word has a count > 1 to differentiate.
tmp_file = tempfile.NamedTemporaryFile() # could be used for LM
if autolm and any([cnt > 2 for cnt in word_counts.values()]):
yield transcription, logs.add(f"Creating unigram LM...", False)
lm_path = tmp_file.name
create_unigram_lm(word_counts, num_sentences, lm_path)
yield transcription, logs.add(f"OK")
if lm_path is None:
yield transcription, logs.add(f"Filtering lexicon....")
lexicon = filter_lexicon(lexicon, word_counts)
yield transcription, logs.add(
f"Ok. Leixcon size after filtering: {len(lexicon)}"
)
else:
# kenlm throws an error if unigram LM is being used
# HACK: generate a bigram LM from unigram LM and a dummy bigram to trick it
maybe_generate_pseudo_bigram_arpa(lm_path)
with tempfile.NamedTemporaryFile() as lexicon_file:
if lm_path is not None and not lm_path.strip():
lm_path = None
with open(lexicon_file.name, "w") as f:
for word, spelling in lexicon.items():
f.write(word + " " + spelling + "\n")
if wscore_usedefault:
wscore = (
WORD_SCORE_DEFAULT_IF_LM
if lm_path is not None
else WORD_SCORE_DEFAULT_IF_NOLM
)
if lmscore_usedefault:
lmscore = LM_SCORE_DEFAULT if lm_path is not None else 0
yield transcription, logs.add(
f"Using word score: {wscore}\nUsing lm score: {lmscore}"
)
beam_search_decoder = ctc_decoder(
lexicon=lexicon_file.name,
tokens=token_file,
lm=lm_path,
nbest=1,
beam_size=500,
beam_size_token=50,
lm_weight=lmscore,
word_score=wscore,
sil_score=0,
blank_token="<s>",
)
beam_search_result = beam_search_decoder(outputs.to("cpu"))
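# Take the single best hypothesis and join its words into the final transcription.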
transcription = " ".join(beam_search_result[0][0].words).strip()
yield transcription, logs.add(f"[DONE]")
# for i in process("upload/english/english.mp3", "upload/english/c4_5k_sentences.txt"):
# print(i)
# for i in process("upload/ligurian/ligurian_1.mp3", "upload/ligurian/zenamt_5k_sentences.txt"):
# print(i)