import gradio as gr
import librosa
import numpy as np
import torch
from transformers import pipeline


# Class index -> language name lookup table. The index of each name is simply
# its position in the list below (0 = "Arabic" ... 44 = "Welsh").
# NOTE(review): "Ukranian" is kept exactly as found; it presumably mirrors the
# label spelling used by the training data -- confirm before "correcting" it.
language_classes = dict(
    enumerate(
        [
            "Arabic",
            "Basque",
            "Breton",
            "Catalan",
            "Chinese_China",
            "Chinese_Hongkong",
            "Chinese_Taiwan",
            "Chuvash",
            "Czech",
            "Dhivehi",
            "Dutch",
            "English",
            "Esperanto",
            "Estonian",
            "French",
            "Frisian",
            "Georgian",
            "German",
            "Greek",
            "Hakha_Chin",
            "Indonesian",
            "Interlingua",
            "Italian",
            "Japanese",
            "Kabyle",
            "Kinyarwanda",
            "Kyrgyz",
            "Latvian",
            "Maltese",
            "Mongolian",
            "Persian",
            "Polish",
            "Portuguese",
            "Romanian",
            "Romansh_Sursilvan",
            "Russian",
            "Sakha",
            "Slovenian",
            "Spanish",
            "Swedish",
            "Tamil",
            "Tatar",
            "Turkish",
            "Ukranian",
            "Welsh",
        ]
    )
)


# Hub namespace of the model owner. Note that model_id below spells the
# namespace out literally rather than deriving it from this variable.
username = "jpbello"
model_id = "jpbello/Hubert_emotion-finetuned-common_language"

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Built once at import time and shared by every request.
pipe = pipeline("audio-classification", model=model_id, device=device)

# def predict_trunc(filepath):
#     preprocessed = pipe.preprocess(filepath)
#     truncated = pipe.feature_extractor.pad(preprocessed,truncation=True, max_length = 16_000*30)
#     model_outputs = pipe.forward(truncated)
#     outputs = pipe.postprocess(model_outputs)

#     return outputs


def classify_audio(filepath):
    """Classify the language spoken in an audio file.

    Args:
        filepath: Path to the audio clip, as handed over by the
            ``gr.Audio(type="filepath")`` input. It is empty/``None`` when the
            form is submitted without any audio.

    Returns:
        A dict mapping each label returned by the pipeline to its score, which
        is the shape the ``gr.Label`` output consumes.

    Raises:
        gr.Error: If no audio was provided, so the UI shows an actionable
            message instead of an opaque pipeline failure.
    """
    # Guard first: passing an empty value on to the pipeline would only fail
    # deep inside it with an error that means nothing to the end user.
    if not filepath:
        raise gr.Error("Please upload or record an audio clip before submitting.")

    # No truncation is applied here; a truncating variant is sketched in the
    # commented-out predict_trunc above.
    preds = pipe(filepath)
    return {p["label"]: p["score"] for p in preds}


# Static copy displayed at the top of the demo page.
title = "Language Classification Model"
description = (
    "Welcome to the Language Classification Model demo powered by Gradio and Hubert Emotion. "
    "This model is trained to identify the language spoken in audio samples, making it a valuable tool "
    "for language identification tasks. Upload an audio file, and let the model predict the spoken language "
    "with confidence scores. Try it out with our provided example audio files to see the model in action!"
)

# Sample clips, addressed relative to the current directory. Each example is
# wrapped in its own one-element list: one value for the single audio input.
filenames = [
    [f"./{clip}"]
    for clip in ("EN_0212.wav", "FR_0061.wav", "JP_0100.wav", "AR_0019.wav")
]

# Wire the classifier to an audio-in / label-out interface and start serving.
demo = gr.Interface(
    title=title,
    description=description,
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[gr.Label(label="Predictions")],
    examples=filenames,
)
demo.launch()