import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel
import soundfile as sf  # NOTE(review): currently unused; kept in case later code writes .wav files

# Load the tokenizer and model for Bulgarian TTS (Text-to-Speech).
# MMS-TTS checkpoints are VITS models: load them with VitsModel, whose forward
# pass returns a synthesized `waveform` directly. (The original used
# AutoModelForPreTraining and read `logits`, which is not audio.)
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = VitsModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")


def tts_generate(text: str):
    """Synthesize Bulgarian speech from `text`.

    Returns:
        A ``(sample_rate, waveform)`` tuple — the format Gradio's ``"audio"``
        output component expects (sample rate FIRST, then a 1-D numpy array).
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # VitsModel returns the audio as `outputs.waveform` with shape
    # (batch, num_samples); drop the batch dimension and move to CPU numpy.
    waveform = outputs.waveform.squeeze(0).cpu().numpy()
    # Use the checkpoint's own sampling rate (16 kHz for MMS-TTS) rather than
    # a hard-coded 22050 Hz, which would play the audio at the wrong speed.
    return model.config.sampling_rate, waveform


# Create Gradio interface
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian.",
)

# Run the interface
if __name__ == "__main__":
    iface.launch()