"""Gradio demo: Czech speech recognition with a fine-tuned wav2vec2 XLS-R model."""

import time
from typing import Optional

import gradio as gr
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    Wav2Vec2ForCTC,
    pipeline,
)

model_id = 'comodoro/wav2vec2-xls-r-300m-cs-250'

# Load the three model components explicitly and hand them to the pipeline,
# so the pipeline does not have to resolve them from the model id itself.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# chunk_length_s=5 asks the pipeline to process the recording in 5-second
# chunks instead of feeding the whole file to the model at once.
p = pipeline(
    "automatic-speech-recognition",
    chunk_length_s=5,
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)


def transcribe(audio: Optional[str], state: str = "") -> str:
    """Transcribe a recording and append the text to ``state``.

    Args:
        audio: Path of the recorded audio file (the Audio component below is
            configured with ``type="filepath"``), or ``None`` when the button
            was pressed before anything was recorded.
        state: Previously accumulated transcript. The click handler below
            passes only ``audio``, so this is ``""`` unless a caller supplies
            it explicitly.

    Returns:
        ``state`` followed by the newly recognised text and a single trailing
        space; ``state`` unchanged when there is no audio to transcribe.
    """
    # Guard against an empty submission: handing None to the pipeline raises
    # instead of producing an empty transcript.
    if audio is None:
        return state

    text = p(audio)["text"]
    return state + text + " "


with gr.Blocks() as blocks:
    audio = gr.Audio(
        source="microphone",
        type="filepath",
        # Implicit concatenation keeps the label free of the stray backslash
        # and whitespace that a backslash continuation inside the literal
        # would put into the user-visible text.
        label=(
            'Pokud je to třeba, povolte mikrofon pro tuto stránku, '
            'klikněte na Record from microphone, po dokončení nahrávání '
            'na Stop recording a poté na Rozpoznat'
        ),
    )
    btn = gr.Button('Rozpoznat')
    output = gr.Textbox(show_label=False)
    btn.click(fn=transcribe, inputs=[audio], outputs=[output])

blocks.launch(enable_queue=True, debug=True)