"""Streamlit front end for Parler-TTS.

The user picks one of several predefined voice descriptions in the sidebar,
enters the text to speak, and presses "Generate Speech". The script runs the
Parler-TTS model on the description/prompt pair, encodes the result as WAV and
shows an audio player for it.
"""

import io

import soundfile as sf
import streamlit as st
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

MODEL_ID = "parler-tts/parler-tts-large-v1"
OUTPUT_FILE = "parler_tts_out.wav"

# Set up the device
device = "cuda:0" if torch.cuda.is_available() else "cpu"


@st.cache_resource
def _load_model_and_tokenizer(model_id: str, target_device: str):
    """Load the TTS model (moved to *target_device*) and its tokenizer.

    Streamlit re-executes the whole script on every widget interaction;
    caching the loaded objects keeps the checkpoint from being reloaded on
    each rerun.
    """
    loaded_model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(
        target_device
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
    return loaded_model, loaded_tokenizer


# Load the model and tokenizer (cached across reruns and sessions)
model, tokenizer = _load_model_and_tokenizer(MODEL_ID, device)

# Neon-themed styling
# NOTE(review): the markup payload is empty in this copy of the file, so this
# call currently renders nothing visible -- confirm whether a <style> block
# was lost.
st.markdown(""" """, unsafe_allow_html=True)

st.title("🎤 Neon TTS Converter")

# Predefined voice options: display name -> description fed to the model
voices = {
    "Smooth Female": "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch.",
    "Monotone Male": "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.",
    "Energetic Youth": "An energetic young speaker with a lively tone and rapid speech, creating a sense of excitement.",
    "Calm Elderly": "An elderly speaker with a calm and slow-paced voice, bringing wisdom and serenity to the speech.",
    "Robotic": "A robotic, artificial voice with a consistent pitch and no variation in tone.",
    "Narrator": "A deep and clear voice, with a strong presence and a slightly slower pace, suitable for narrations.",
    "Whisper": "A soft, whispered voice, with very low volume and an intimate tone.",
    "Formal": "A formal, authoritative voice with clear articulation and a steady pace.",
    "Happy": "A cheerful, upbeat voice with a positive tone and lively intonation.",
    "Mysterious": "A mysterious and low-pitched voice, with slow delivery and a sense of intrigue.",
    "Bass-Heavy Male": "A deep, resonant male voice with a strong bass, ideal for dramatic and powerful delivery.",
    "Actor Voice 1": "An actor's voice with a dynamic range, capable of various emotional tones and expressions.",
    "Actor Voice 2": "A distinct and engaging actor's voice, providing a unique flair and character to the speech.",
}

# Sidebar for voice selection
st.sidebar.header("Select Voice")
voice_choice = st.sidebar.selectbox("Choose a Voice", list(voices.keys()))

# Display the selected voice description
st.sidebar.markdown(f"**Description:** {voices[voice_choice]}")

# Input for custom prompt
st.sidebar.header("Custom Prompt")
prompt = st.sidebar.text_area(
    "Enter your custom prompt", value="Hey, how are you doing today?"
)

# Generate the TTS output
if st.sidebar.button("Generate Speech"):
    if not prompt.strip():
        # Nothing to speak: skip the model call instead of generating from an
        # empty prompt.
        st.warning("Please enter some text to convert to speech.")
    else:
        # Top-level UI boundary: report any failure to the user rather than
        # crashing the page.
        try:
            description = voices[voice_choice]

            # Tokenize each text once and reuse both the ids and the mask.
            description_inputs = tokenizer(description, return_tensors="pt")
            prompt_inputs = tokenizer(prompt, return_tensors="pt")

            with st.spinner("Generating speech..."):
                generation = model.generate(
                    input_ids=description_inputs.input_ids.to(device),
                    prompt_input_ids=prompt_inputs.input_ids.to(device),
                    attention_mask=description_inputs.attention_mask.to(device),
                    prompt_attention_mask=prompt_inputs.attention_mask.to(device),
                )
            audio_arr = generation.cpu().numpy().squeeze()

            # Encode to WAV in memory so the player is fed this session's own
            # audio; a fixed file name is shared by all concurrent sessions.
            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, audio_arr, model.config.sampling_rate, format="WAV")
            wav_bytes = wav_buffer.getvalue()

            # Still save the audio file to the path the script has always
            # written, for anything that picks it up from disk.
            with open(OUTPUT_FILE, "wb") as wav_file:
                wav_file.write(wav_bytes)

            # Display the audio player
            st.audio(wav_bytes, format="audio/wav")
            st.success("Speech generation complete!")
        except Exception as e:
            st.error(f"An error occurred: {e}")