import os 
import sys 
os.system("pip install transformers==4.27.0")
from transformers import pipeline, WhisperModel, WhisperTokenizer
os.system("pip install evaluate")
os.system("pip install datasets")
os.system("pip install llvmlite")
os.system("pip install spicy")
os.system("pip install soundfile")
os.system("pip install datasets[audio]")
#os.system("pip install numpy==1.21.4")
#os.system("pip install numpy==1.22.1")
os.system("pip install numba==0.51.2")
from evaluate import evaluator
from datasets import load_dataset, Audio
from datasets import disable_caching
from datasets import set_caching_enabled
set_caching_enabled(False)
disable_caching()

from transformers import pipeline, WhisperModel, WhisperTokenizer, AutoConfig
from datasets import load_dataset


metric = evaluate.load("wer")

# Load the Whisper model and tokenizer
huggingface_token = os.environ["huggingface_token"]
whisper_miso = WhisperModel.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)
miso_tokenizer = WhisperTokenizer.from_pretrained("mskov/whisper_miso", use_auth_token=huggingface_token)

# Initialize the automatic-speech-recognition pipeline with the Whisper model and tokenizer
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model=whisper_miso,
    tokenizer=miso_tokenizer
)

# Load the dataset
dataset = load_dataset("mskov/miso_test", split="test").cast_column("audio", Audio())

# Compute the evaluation results
results = asr_pipeline(dataset)
metric = WERMetric()
wer = metric.compute(predictions=results, references=dataset["audio"])
print(wer)


def transcribe(audio, state=""):
    text = p(audio)["text"]
    state += text + " "
    return state, state

gr.Interface(
    fn=transcribe, 
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True), 
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True).launch()