# app.py — Tibetan text-to-speech demo (Hugging Face Space by ganga4364, commit 2939710)
import gradio as gr
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import scipy.io.wavfile
import numpy as np
# Fine-tuned MMS-TTS checkpoint for Tibetan (language code "bod").
# Swap in a different checkpoint here if you retrain the model.
_CHECKPOINT = "ganga4364/mms-tts-bod-finetune-sherab"

# Build a text-to-speech pipeline around the checkpoint.
# Pass device=0 to run on GPU instead of CPU.
synthesiser = pipeline("text-to-speech", _CHECKPOINT)
# Function to perform TTS inference and save audio to a file
def generate_audio(input_text):
    """Synthesize speech for *input_text* and return the path to a WAV file.

    Args:
        input_text: Text to synthesize (expected to be Tibetan script,
            matching the fine-tuned MMS-TTS model's language).

    Returns:
        Path to the WAV file written in the working directory. The same
        file is overwritten on every call.
    """
    # Run TTS inference; the pipeline returns a dict with an "audio"
    # array (batch-first) and its "sampling_rate".
    speech = synthesiser(input_text)

    file_path = "finetuned_output.wav"
    # Write the first (only) waveform in the batch to disk.
    scipy.io.wavfile.write(file_path, rate=speech["sampling_rate"], data=speech["audio"][0])
    return file_path
# UI copy shown on the demo page.
_TITLE = "Tibetan Text-to-Speech (MMS-TTS)"
_DESC = "Enter Tibetan text and generate speech using MMS-TTS."

# Wire the synthesis function into a simple text-in / audio-out interface.
iface = gr.Interface(
    fn=generate_audio,
    inputs="text",
    outputs="audio",
    title=_TITLE,
    description=_DESC,
)

# Start the web app.
iface.launch()