import gradio as gr
from transformers import pipeline
import scipy.io.wavfile

# Load the MMS-TTS model for Tibetan (bod)
model_id = "ganga4364/mms-tts-bod-finetune-sherab"  # Replace with your fine-tuned model if necessary

# Build the text-to-speech pipeline with the model
synthesiser = pipeline("text-to-speech", model=model_id)  # add device=0 if you want to use a GPU

# Perform TTS inference and save the generated audio to a file
def generate_audio(input_text):
    # Run TTS inference; the pipeline returns the waveform and its sampling rate
    speech = synthesiser(input_text)
    file_path = "finetuned_output.wav"
    # Write the waveform to a WAV file
    scipy.io.wavfile.write(file_path, rate=speech["sampling_rate"], data=speech["audio"][0])
    # Return the path to the audio file so Gradio can play it
    return file_path

# Create the Gradio interface
iface = gr.Interface(
    fn=generate_audio,
    inputs="text",    # Text input for the TTS
    outputs="audio",  # Output is an audio file
    title="Tibetan Text-to-Speech (MMS-TTS)",
    description="Enter Tibetan text and generate speech using MMS-TTS.",
)

# Launch the Gradio interface
iface.launch()