"""Gradio demo that transcribes audio with a pretrained Wav2Vec2 CTC model."""

import gradio as gr
import tensorflow as tf
from wav2vec2 import Wav2Vec2Processor, Wav2Vec2ForCTC

# Identifier of the pretrained checkpoint handed to `from_pretrained`.
MODEL_ID = "vasudevgupta/gsoc-wav2vec2-960h"


def main() -> None:
    """Load the processor, tokenizer and model, then launch the Gradio UI."""
    # Built once at start-up and shared by every request through the closures
    # below, instead of being re-created per call.
    processor = Wav2Vec2Processor(is_tokenizer=False)
    tokenizer = Wav2Vec2Processor(is_tokenizer=True)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

    def _forward(speech: tf.Tensor) -> tf.Tensor:
        """Run the model on one utterance and return the arg-max token ids.

        Args:
            speech: Audio samples as a float tensor.

        Returns:
            The index of the highest-scoring entry along the last axis of the
            model output, with size-1 dimensions squeezed away.
        """
        # `[None]` prepends an axis so a single utterance forms a batch of one.
        batch = processor(speech)[None]
        logits = model(batch, training=False)
        return tf.squeeze(tf.argmax(logits, axis=-1))

    def recognize_text(inputs) -> str:
        """Transcribe the audio delivered by the Gradio "audio" input.

        Args:
            inputs: A 2-tuple whose second element is the audio samples (the
                first element is ignored), or ``None`` when the user
                submitted the form without providing any audio.

        Returns:
            The decoded transcript, or an empty string when there is no audio.
        """
        # Gradio hands over `None` when nothing was recorded or uploaded;
        # unpacking it would raise a TypeError, so return an empty transcript.
        if inputs is None:
            return ""

        # NOTE(review): the first tuple element (presumably the sample rate)
        # is discarded, so the audio is never resampled -- confirm the rate
        # the checkpoint expects and resample if they can differ.
        _, samples = inputs
        speech = tf.constant(samples, dtype=tf.float32)
        # NOTE(review): presumably flips (samples, channels) into
        # (channels, samples); this is a no-op for 1-D input. Multi-channel
        # behaviour is not verifiable from this file -- confirm.
        speech = tf.transpose(speech)
        token_ids = _forward(speech)
        return tokenizer.decode(token_ids.numpy().tolist())

    gr.Interface(fn=recognize_text, inputs="audio", outputs="text").launch()


if __name__ == "__main__":
    main()