# ttsdoc / app.py
# Sethu Iyer
# App added
# 020af7d
import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
import docx2txt
from PyPDF2 import PdfReader
import re
import os
from pydub import AudioSegment
import tempfile
# Global variables and model initialization.
# Everything below runs once at import time and is shared by all requests.

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Checkpoint identifier passed to every from_pretrained() call below.
repo_id = "parler-tts/parler-tts-mini-v1"
# The model is moved to `device`; the tokenizer and feature extractor are not.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
# Sampling rate reported by the feature extractor; generate_audio() returns it
# alongside each generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
def preprocess_text(text):
    """Normalize *text* before it is tokenized.

    Every run of whitespace is collapsed to a single space, leading and
    trailing whitespace is removed, and the digits of each number are
    separated by spaces (``"911"`` becomes ``"9 1 1"``).
    """

    def _space_out_digits(match):
        # Joining the matched string inserts a space between its characters.
        return ' '.join(match.group(0))

    collapsed = re.sub(r'\s+', ' ', text)
    trimmed = collapsed.strip()
    return re.sub(r'\d+', _space_out_digits, trimmed)
def extract_text_from_file(file):
    """Return the text content of an uploaded TXT, DOCX or PDF file.

    Args:
        file: Either an object exposing the file path as ``file.name`` or a
            plain path (``str`` / ``os.PathLike``).

    Returns:
        The extracted text as a single string. PDF pages are joined with a
        single space.

    Raises:
        ValueError: If the file extension is not ``.txt``, ``.docx`` or
            ``.pdf`` (compared case-insensitively).
    """
    # Path-like values are used directly; anything else is expected to carry
    # the path in ``.name``. (Path objects also have a ``.name``, but it is only
    # the base name, so they must be handled before the attribute lookup.)
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered = path.lower()
    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    if lowered.endswith('.docx'):
        return docx2txt.process(path)
    if lowered.endswith('.pdf'):
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            # Guard against pages that yield no text so the join cannot fail
            # on a None entry.
            return ' '.join(page.extract_text() or '' for page in reader.pages)
    raise ValueError("Unsupported file type")
def split_text_into_chunks(text, max_length=1000):
    """Split *text* on whitespace into chunks of at most *max_length* characters.

    Words are never broken: a single word longer than *max_length* becomes a
    chunk of its own. No empty chunks are produced.

    Args:
        text: The text to split.
        max_length: Maximum length, in characters, of each space-joined chunk.

    Returns:
        A list of chunk strings; empty when *text* contains no words.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(' '.join(current_chunk))
    for word in text.split():
        # A separating space is only needed when the chunk already has words.
        added_length = len(word) + 1 if current_chunk else len(word)
        # Only flush a non-empty chunk; otherwise an over-long first word
        # would emit an empty string as a chunk.
        if current_chunk and current_length + added_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += added_length
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
@spaces.GPU(duration=300)
def generate_audio(text, description):
    """Generate speech for *text* using the voice described by *description*.

    The text is passed through preprocess_text() first; the description is
    only stripped of surrounding whitespace.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple, where ``audio`` is the output of
        ``model.generate`` moved to the CPU, converted to a NumPy array and
        squeezed.
    """
    cleaned_text = preprocess_text(text)
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    text_tokens = tokenizer(cleaned_text, return_tensors="pt").to(device)

    # The description conditions the voice; the prompt carries the words to speak.
    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": text_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": text_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generate_kwargs)
    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
def _to_pcm16(samples):
    """Return *samples* as a 1-D (or higher) NumPy array of 16-bit PCM values."""
    import numpy as np  # local import: numpy is not imported at file level

    samples = np.atleast_1d(np.asarray(samples))
    if np.issubdtype(samples.dtype, np.floating):
        # NOTE(review): assumes float waveforms are normalized to [-1.0, 1.0];
        # values outside that range are clipped — confirm against the model.
        samples = np.clip(samples, -1.0, 1.0) * 32767.0
    return samples.astype(np.int16)


def process_input(file, text_input, description, max_duration):
    """Turn an uploaded file or typed text into a single WAV file.

    The text is split into chunks, each chunk is synthesized separately, and
    the resulting segments are concatenated until adding another one would
    exceed *max_duration*.

    Args:
        file: Uploaded file (takes precedence when truthy), or a falsy value.
        text_input: Text typed by the user; used when no file is given.
        description: Voice description forwarded to generate_audio().
        max_duration: Maximum length of the combined audio, in seconds.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when there is
        nothing to synthesize or an error occurred.
    """
    if file:
        try:
            text = extract_text_from_file(file)
        except Exception as e:  # UI boundary: report instead of crashing
            return None, f"Error reading file: {str(e)}"
    else:
        text = text_input

    # Whitespace-only input yields no chunks, so treat it like missing input.
    if not text or not text.strip():
        return None, "Please provide text input or upload a file."

    try:
        audio_segments = []
        total_duration = 0
        for chunk in split_text_into_chunks(text):
            sample_rate, samples = generate_audio(chunk, description)
            # AudioSegment is told the data is 16-bit mono (sample_width=2),
            # so the samples must be converted to int16 before taking bytes.
            segment = AudioSegment(
                _to_pcm16(samples).tobytes(),
                frame_rate=sample_rate,
                sample_width=2,
                channels=1,
            )
            chunk_duration = len(segment) / 1000  # Duration in seconds
            if total_duration + chunk_duration > max_duration:
                break
            audio_segments.append(segment)
            total_duration += chunk_duration

        if not audio_segments:
            return None, "Generated audio exceeds maximum duration. Please use shorter text."

        combined_audio = sum(audio_segments)
        # Reserve a path, then export after the handle is closed so the file
        # is not opened twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        combined_audio.export(output_path, format="wav")
        return output_path, None
    except Exception as e:  # UI boundary: surface the failure as a message
        return None, f"Error generating audio: {str(e)}"
def update_max_duration(file, text_input):
    """Suggest a value for the maximum-duration slider from the current input.

    The estimate assumes roughly three words per second and is clamped to the
    range 60-300 seconds. When there is no usable text, 60 is suggested.

    Args:
        file: Uploaded file (takes precedence when truthy), or a falsy value.
        text_input: Text typed by the user; used when no file is given.

    Returns:
        A Gradio update that sets the slider's ``value``.
    """
    # gr.update() is used instead of the component-level gr.Slider.update(),
    # which newer Gradio releases no longer provide.
    if file:
        try:
            text = extract_text_from_file(file)
        except (ValueError, OSError):
            # Unsupported or unreadable upload: fall back to the default
            # rather than raising inside a change handler.
            return gr.update(value=60)
    else:
        text = text_input
    if not text:
        return gr.update(value=60)
    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
    return gr.update(value=min(300, max(60, estimated_duration)))
# Gradio interface
# Custom stylesheet handed to gr.Blocks(css=css) below. The class names used
# here (container, input-area, output-area, generate-btn) are attached to
# components through their elem_classes arguments.
css = """
.container {
max-width: 850px;
margin: auto;
padding: 20px;
background-color: #f0f4f8;
border-radius: 12px;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.input-area, .output-area {
background-color: white;
padding: 25px;
border-radius: 8px;
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
margin-bottom: 20px;
}
.generate-btn {
background-color: #4CAF50 !important;
color: white !important;
padding: 10px 20px !important;
font-size: 16px !important;
font-weight: bold !important;
border-radius: 5px !important;
border: none !important;
cursor: pointer !important;
transition: background-color 0.3s !important;
}
.generate-btn:hover {
background-color: #45a049 !important;
}
"""
# Build the UI: inputs on the left, outputs on the right, tips underneath.
# The emoji in the labels below were previously mis-encoded (UTF-8 bytes shown
# as unrelated characters) and are restored to the intended symbols.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")
    with gr.Row(elem_classes="container"):
        with gr.Column(elem_classes="input-area"):
            file_input = gr.File(label="📄 Upload File (TXT, DOCX, PDF)")
            text_input = gr.Textbox(label="✍️ Or enter text here", lines=5, placeholder="Type or paste your text here...")
            description = gr.Textbox(
                label="🗣️ Voice Description",
                lines=2,
                value="A clear, neutral voice with minimal background noise.",
                placeholder="Describe the voice characteristics you want..."
            )
            max_duration = gr.Slider(
                minimum=10,
                maximum=300,
                value=60,
                step=10,
                label="⏱️ Maximum Audio Duration (seconds)"
            )
            submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")
        with gr.Column(elem_classes="output-area"):
            output_audio = gr.Audio(label="🔊 Generated Audio")
            error_output = gr.Markdown()

    # Re-estimate the duration slider whenever either input source changes.
    file_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    text_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    # process_input returns (audio path, error message), matching the two outputs.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, text_input, description, max_duration],
        outputs=[output_audio, error_output]
    )
    gr.Markdown(
        """
## 📌 Tips for Best Results
- For longer texts, the generator will create audio up to the specified maximum duration.
- Experiment with different voice descriptions to achieve the desired output.
- Use punctuation to control pacing and intonation in the generated speech.
- For optimal quality, try to keep individual sentences or paragraphs concise.
## 🛠️ Technical Details
- This demo uses the Parler TTS Mini v1 model.
- Audio generation is GPU-accelerated for faster processing.
- Maximum file size for uploads: 5MB
"""
    )
# Enable the queue on the Blocks app, then start serving it.
demo.queue()
demo.launch()