"""Gradio demo that transcribes recorded or uploaded audio to text.

Run with:

    python app.py
"""
import os
import tempfile

import gradio as gr
import soundfile as sf
from transformers import AutoModelForCTC
from transformers import Wav2Vec2Processor

from conversationalnlp.models.wav2vec2 import ModelLoader
from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
from conversationalnlp.utils import *  # noqa: F401,F403 - kept from the original file

# Directory under which per-request scratch folders are created.
audiosavepath = os.getcwd()

pretrained_model = "codenamewei/speech-to-text"

# Load the processor and model once at import time so every request reuses them.
processor = Wav2Vec2Processor.from_pretrained(pretrained_model)
model = AutoModelForCTC.from_pretrained(pretrained_model)

modelloader = ModelLoader(model, processor)
predictor = Wav2Vec2Predict(modelloader)


def greet(audioarray):
    """Transcribe one audio clip handed over by the Gradio ``audio`` input.

    Args:
        audioarray: ``(sample_rate, samples)`` pair, for example
            ``(16000, array([-5277184, 326400, ...], dtype=int32))``.
            ``None`` is accepted and treated as "no audio supplied"
            (Gradio is expected to pass ``None`` when nothing was recorded).

    Returns:
        The last predicted text and the last corrected text reported by
        ``predictor.predictfiles``, separated by a newline; or a short
        notice when no audio was supplied.
    """
    if audioarray is None:
        # Without this guard the tuple indexing below raises TypeError.
        return "No audio received. Please record or upload a clip."

    sample_rate, samples = audioarray[0], audioarray[1]

    # WORKAROUND: save to file and re-read to get the array shape needed for
    # prediction. A private scratch directory per call keeps concurrent
    # requests from overwriting each other's "temp.wav", and the directory
    # (with the file) is removed as soon as the prediction is done.
    with tempfile.TemporaryDirectory(dir=audiosavepath) as scratch_dir:
        audioabspath = os.path.join(scratch_dir, "temp.wav")
        sf.write(audioabspath, samples, sample_rate)
        print(f"Audio at path {audioabspath}")

        # The prediction must run while the scratch directory still exists.
        predictiontexts = predictor.predictfiles([audioabspath])

    predicted = predictiontexts["predicted_text"][-1]
    corrected = predictiontexts["corrected_text"][-1]
    return predicted + "\n" + corrected


demo = gr.Interface(
    fn=greet,
    inputs="audio",
    outputs="text",
    title="Speech-to-Text",
)

# Pass share=True to launch() to expose a public link.
demo.launch()