import gradio as gr
import os
import torch
import librosa
from glob import glob
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, AutoModelForTokenClassification, TokenClassificationPipeline, Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM
SAMPLE_RATE = 16_000
models = {}
models_paths = {
"en-US": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
"fr-FR": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
"nl-NL": "jonatasgrosman/wav2vec2-large-xlsr-53-dutch",
"pl-PL": "jonatasgrosman/wav2vec2-large-xlsr-53-polish",
"it-IT": "jonatasgrosman/wav2vec2-large-xlsr-53-italian",
"ru-RU": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
"pt-PT": "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
"de-DE": "jonatasgrosman/wav2vec2-large-xlsr-53-german",
"es-ES": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish",
"ja-JP": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
"ar-SA": "jonatasgrosman/wav2vec2-large-xlsr-53-arabic",
"fi-FI": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish",
"hu-HU": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian",
"zh-CN": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn",
"el-GR": "jonatasgrosman/wav2vec2-large-xlsr-53-greek",
}
# Classifier Intent
model_name = 'qanastek/XLMRoberta-Alexa-Intents-Classification'
tokenizer_intent = AutoTokenizer.from_pretrained(model_name)
model_intent = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_intent = TextClassificationPipeline(model=model_intent, tokenizer=tokenizer_intent)
# Classifier Language
model_name = 'qanastek/51-languages-classifier'
tokenizer_langs = AutoTokenizer.from_pretrained(model_name)
model_langs = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier_language = TextClassificationPipeline(model=model_langs, tokenizer=tokenizer_langs)
# NER Extractor
model_name = 'qanastek/XLMRoberta-Alexa-Intents-NER-NLU'
tokenizer_ner = AutoTokenizer.from_pretrained(model_name)
model_ner = AutoModelForTokenClassification.from_pretrained(model_name)
predict_ner = TokenClassificationPipeline(model=model_ner, tokenizer=tokenizer_ner)
EXAMPLE_DIR = './wavs/'
examples = sorted(glob(os.path.join(EXAMPLE_DIR, '*.wav')))
examples = [[e, e.split("=")[0].split("/")[-1]] for e in examples]
def transcribe(audio_path, lang_code):
speech_array, sampling_rate = librosa.load(audio_path, sr=16_000)
if lang_code not in models:
models[lang_code] = {}
models[lang_code]["processor"] = Wav2Vec2Processor.from_pretrained(models_paths[lang_code])
models[lang_code]["model"] = Wav2Vec2ForCTC.from_pretrained(models_paths[lang_code])
# Load model
processor_asr = models[lang_code]["processor"]
model_asr = models[lang_code]["model"]
inputs = processor_asr(speech_array, sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
logits = model_asr(inputs.input_values, attention_mask=inputs.attention_mask).logits
predicted_ids = torch.argmax(logits, dim=-1)
return processor_asr.batch_decode(predicted_ids)[0]
def getUniform(text):
idx = 0
res = {}
for t in text:
raw = t["entity"].replace("B-","").replace("I-","")
word = t["word"].replace("▁","")
if "B-" in t["entity"]:
res[f"{raw}|{idx}"] = [word]
idx += 1
else:
res[f"{raw}|{idx}"].append(word)
res = [(r.split("|")[0], res[r]) for r in res]
return res
def predict(wav_file, lang_code):
if lang_code not in models_paths.keys():
return {
"The language code is unknown!"
}
text = transcribe(wav_file, lang_code).replace("apizza","a pizza") + " ."
intent_class = classifier_intent(text)[0]["label"]
language_class = classifier_language(text)[0]["label"]
named_entities = getUniform(predict_ner(text))
return {
"text": text,
"language": language_class,
"intent_class": intent_class,
"named_entities": named_entities,
}
iface = gr.Interface(
predict,
title='Alexa Clone 👩💼 🗪 🤖 Multilingual NLU',
description='Upload your wav file to test the models (First execution take about 20s to 30s, then next run in less than 1s)',
# thumbnail="",
inputs=[
gr.inputs.Audio(label='wav file', source='microphone', type='filepath'),
gr.inputs.Dropdown(choices=list(models_paths.keys())),
],
outputs=[
gr.outputs.JSON(label='ASR -> Slot Recognition + Intent Classification + Language Classification'),
],
examples=examples,
article='Made with ❤️ by Yanis Labrak thanks to 🤗',
)
iface.launch()