# TextToAudio / app.py
# ibrahim313's picture
# Update app.py
# 562fd62 verified
import streamlit as st
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
# Set up the device: first CUDA GPU when available, otherwise CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_tts(checkpoint, target_device):
    """Load the Parler-TTS model and its tokenizer for *checkpoint*.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the loaded objects avoids re-loading the
    checkpoint on each rerun.

    Args:
        checkpoint: Hub identifier passed to ``from_pretrained``.
        target_device: Device string the model is moved to.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tts_model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint).to(target_device)
    tts_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return tts_model, tts_tokenizer


# Load the model and tokenizer (cached across script reruns)
model, tokenizer = _load_tts("parler-tts/parler-tts-large-v1", device)
# Neon-themed styling: a <style> block injected into the page as raw HTML.
# It targets the page body, the text input / text area widgets, and buttons.
_neon_css = """
<style>
body {
background-color: #0f0f0f;
color: #0fff0f;
}
.stTextInput, .stTextArea {
background-color: #333333;
color: #0fff0f;
}
.stButton > button {
background-color: #0fff0f;
color: #0f0f0f;
}
</style>
"""
# unsafe_allow_html is required for the raw <style> tag to be emitted as HTML
st.markdown(_neon_css, unsafe_allow_html=True)

# Page heading
st.title("🎀 Neon TTS Converter")
# Predefined voice options, as (display name, voice description) pairs.
# The order here is the order the names appear in when the mapping is listed.
_voice_presets = (
    ("Smooth Female", "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch."),
    ("Monotone Male", "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."),
    ("Energetic Youth", "An energetic young speaker with a lively tone and rapid speech, creating a sense of excitement."),
    ("Calm Elderly", "An elderly speaker with a calm and slow-paced voice, bringing wisdom and serenity to the speech."),
    ("Robotic", "A robotic, artificial voice with a consistent pitch and no variation in tone."),
    ("Narrator", "A deep and clear voice, with a strong presence and a slightly slower pace, suitable for narrations."),
    ("Whisper", "A soft, whispered voice, with very low volume and an intimate tone."),
    ("Formal", "A formal, authoritative voice with clear articulation and a steady pace."),
    ("Happy", "A cheerful, upbeat voice with a positive tone and lively intonation."),
    ("Mysterious", "A mysterious and low-pitched voice, with slow delivery and a sense of intrigue."),
    ("Bass-Heavy Male", "A deep, resonant male voice with a strong bass, ideal for dramatic and powerful delivery."),
    ("Actor Voice 1", "An actor's voice with a dynamic range, capable of various emotional tones and expressions."),
    ("Actor Voice 2", "A distinct and engaging actor's voice, providing a unique flair and character to the speech."),
)

# Mapping from display name to the description text for that voice
voices = dict(_voice_presets)
# Sidebar section 1: pick one of the predefined voices by name
sidebar = st.sidebar
sidebar.header("Select Voice")
voice_names = list(voices)
voice_choice = sidebar.selectbox("Choose a Voice", voice_names)

# Show the description text that belongs to the chosen voice
selected_description = voices[voice_choice]
sidebar.markdown(f"**Description:** {selected_description}")

# Sidebar section 2: free-text prompt, pre-filled with a sample sentence
sidebar.header("Custom Prompt")
prompt = sidebar.text_area(
    "Enter your custom prompt",
    value="Hey, how are you doing today?",
)
# Generate the TTS output when the sidebar button is clicked
if st.sidebar.button("Generate Speech"):
    if not prompt.strip():
        # Nothing to synthesize: skip the expensive generate call entirely
        st.warning("Please enter some text to convert to speech.")
    else:
        # Error handling: report any failure in the page instead of crashing
        try:
            description = voices[voice_choice]

            # Tokenize each text once and take both the ids and the
            # attention mask from the same encoding
            description_encoding = tokenizer(description, return_tensors="pt")
            prompt_encoding = tokenizer(prompt, return_tensors="pt")

            # Generate speech: the voice description goes in as input_ids,
            # the user's text as prompt_input_ids
            generation = model.generate(
                input_ids=description_encoding.input_ids.to(device),
                prompt_input_ids=prompt_encoding.input_ids.to(device),
                attention_mask=description_encoding.attention_mask.to(device),
                prompt_attention_mask=prompt_encoding.attention_mask.to(device),
            )
            # Move the result to host memory and drop size-1 dimensions
            audio_arr = generation.cpu().numpy().squeeze()

            # Save the audio file
            # NOTE(review): this fixed path is shared by every session of the
            # app; consider a per-session file or in-memory buffer — confirm
            # whether anything else reads this file before changing it.
            output_file = "parler_tts_out.wav"
            sf.write(output_file, audio_arr, model.config.sampling_rate)

            # Display the audio player
            st.audio(output_file)
            st.success("Speech generation complete!")
        except Exception as e:
            st.error(f"An error occurred: {e}")