|
import gradio as gr |
|
from transformers import pipeline |
|
import numpy as np |
|
from ner import perform_ner |
|
from intent import perform_intent_classification |
|
|
|
# Build the speech-to-text pipeline once at import time so that every
# streamed chunk reuses the same loaded model.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)
|
|
|
def transcribe(stream, new_chunk):
    """Transcribe the accumulated microphone audio and analyse the text.

    The newest chunk is peak-normalised and appended to the audio received
    so far, then the whole recording is transcribed again. Because the full
    recording is re-transcribed on every call, no text has to be carried
    over between calls. Once the transcription contains a sentence
    terminator (``.`` or ``?``), named-entity recognition and intent
    classification are run on the extracted sentence.

    Args:
        stream: Audio accumulated by previous calls, or ``None`` on the
            first call.
        new_chunk: ``(sampling_rate, samples)`` tuple for the newest chunk,
            where ``samples`` is a NumPy array.

    Returns:
        A ``(stream, transcription, ner_result, intent_result)`` tuple.
        ``ner_result`` and ``intent_result`` are ``None`` while the
        transcription contains no sentence terminator.
    """
    sr, y = new_chunk

    # Down-mix (samples, channels) input to mono; the ASR pipeline is
    # documented to expect single-channel audio.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Peak-normalise the chunk. Empty or all-zero (silent) chunks are left
    # untouched: dividing by a zero peak would fill the stream with NaNs.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])

    # Run the model once per call and reuse the text for logging, sentence
    # detection and the returned value.
    transcription = transcriber({"sampling_rate": sr, "raw": stream})["text"]
    print(transcription)

    # Defaults for calls where no complete sentence is available yet, so the
    # return statement never touches unbound names.
    ner_result = None
    intent_result = None

    # Position of the last sentence terminator, or -1 when there is none.
    terminator_pos = max(transcription.rfind("."), transcription.rfind("?"))
    if terminator_pos != -1:
        # Split on the terminator that is actually present rather than on
        # whatever the final character happens to be: text such as
        # "Hi. How are" must not be split on "e".
        terminator = transcription[terminator_pos]
        sentence = transcription.partition(terminator)[0]
        print("Sentence :", sentence)

        ner_result = perform_ner(sentence)
        intent_result = perform_intent_classification(sentence)
        print("NER Result (sentence):", ner_result)
        print("Intent Result (sentence):", intent_result)

    return stream, transcription, ner_result, intent_result
|
|
|
# Streaming microphone input; the "state" slot carries the accumulated audio
# between successive calls to `transcribe`.
microphone = gr.Audio(sources=["microphone"], streaming=True)

# One text box per value returned by `transcribe` after the state.
transcript_box = gr.Text(label="Transcribe")
ner_box = gr.Text(label="NER")
intent_box = gr.Text(label="Intent")

demo = gr.Interface(
    fn=transcribe,
    inputs=["state", microphone],
    outputs=["state", transcript_box, ner_box, intent_box],
    live=True,
)

demo.launch(share=True)