import gradio as gr
import torch
from transformers import AutoTokenizer, VitsModel
import soundfile as sf  # NOTE(review): currently unused; kept in case later code writes .wav files

# Load the tokenizer and model for Bulgarian TTS (Text-to-Speech).
# MMS-TTS checkpoints are VITS models: load them with VitsModel, whose forward
# pass returns a synthesized `waveform` directly. (The original used
# AutoModelForPreTraining and read `logits`, which is not audio.)
tokenizer = AutoTokenizer.from_pretrained("Opit/mms_tts_bulgarian_finetuning")
model = VitsModel.from_pretrained("Opit/mms_tts_bulgarian_finetuning")


def tts_generate(text: str):
    """Synthesize Bulgarian speech from `text`.

    Returns:
        A ``(sample_rate, waveform)`` tuple — the format Gradio's ``"audio"``
        output component expects (sample rate FIRST, then a 1-D numpy array).
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    # VitsModel returns the audio as `outputs.waveform` with shape
    # (batch, num_samples); drop the batch dimension and move to CPU numpy.
    waveform = outputs.waveform.squeeze(0).cpu().numpy()
    # Use the checkpoint's own sampling rate (16 kHz for MMS-TTS) rather than
    # a hard-coded 22050 Hz, which would play the audio at the wrong speed.
    return model.config.sampling_rate, waveform


# Create Gradio interface
iface = gr.Interface(
    fn=tts_generate,
    inputs="text",
    outputs="audio",
    title="Bulgarian TTS (Text-to-Speech)",
    description="Enter text to generate speech in Bulgarian.",
)

# Run the interface
if __name__ == "__main__":
    iface.launch()