Spaces:

sethuiyer
/

ttsdoc

Runtime error

File size: 7,335 Bytes

020af7d

import spaces
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
import docx2txt
from PyPDF2 import PdfReader
import re
import os
from pydub import AudioSegment
import tempfile

# Global variables and model initialization.
# These statements run at module import and the resulting objects are used as
# module-level globals by generate_audio() below.

# Prefer the first CUDA device when torch reports one, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Hub repository id from which the model, tokenizer and feature extractor load.
repo_id = "parler-tts/parler-tts-mini-v1"
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
# Sampling rate reported by the feature extractor; generate_audio() returns it
# alongside the waveform.
SAMPLE_RATE = feature_extractor.sampling_rate

def preprocess_text(text):
    """Normalize raw text before it is tokenized for speech generation.

    Two passes are applied in order:

    1. Every run of whitespace is collapsed to a single space and the
       result is stripped at both ends.
    2. Every run of digits is rewritten with a space between consecutive
       digits (``"123"`` becomes ``"1 2 3"``).

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    def _spell_out_digits(match):
        # Iterating the matched string yields its individual digit characters.
        return ' '.join(match.group(0))

    collapsed = re.sub(r'\s+', ' ', text)
    trimmed = collapsed.strip()
    return re.sub(r'\d+', _spell_out_digits, trimmed)

def extract_text_from_file(file):
    """Return the text content of an uploaded ``.txt``, ``.docx`` or ``.pdf``.

    Args:
        file: Either an object exposing the on-disk path as ``.name`` (what
            the original code expected from the upload widget) or a plain
            path (``str`` / ``os.PathLike``), so the function keeps working
            when the caller hands over the path directly.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Match the extension case-insensitively so "REPORT.PDF" is accepted too.
    lowered = path.lower()

    if lowered.endswith('.txt'):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    if lowered.endswith('.docx'):
        return docx2txt.process(path)
    if lowered.endswith('.pdf'):
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            # Guard against a page yielding no text (None) so the join
            # cannot fail with a TypeError.
            return ' '.join((page.extract_text() or '') for page in reader.pages)
    raise ValueError("Unsupported file type")

def split_text_into_chunks(text, max_length=1000):
    """Split text on whitespace into chunks of at most ``max_length`` chars.

    Words are never broken. A chunk's length is measured as the length of
    its words joined by single spaces. A single word longer than
    ``max_length`` is emitted as its own (oversized) chunk.

    Args:
        text: The text to split; any whitespace separates words.
        max_length: Maximum joined length of a chunk, in characters.

    Returns:
        A list of non-empty chunk strings, in original word order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # length of ' '.join(current_chunk)

    for word in text.split():
        # The separator only costs a character when the chunk already has a
        # word in it; counting it unconditionally made the first chunk one
        # character stricter than all later ones.
        separator = 1 if current_chunk else 0
        candidate_length = current_length + separator + len(word)

        # Only flush a non-empty buffer: an oversized first word must not
        # produce an empty '' chunk.
        if current_chunk and candidate_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

@spaces.GPU(duration=300)
def generate_audio(text, description):
    """Synthesize speech for one piece of text with the module-level model.

    The text is normalized with ``preprocess_text`` and used as the prompt;
    the stripped ``description`` is tokenized separately and passed as the
    main ``input_ids``. Sampling is enabled with temperature 1.0.

    Args:
        text: The text to be spoken.
        description: Free-text description of the desired voice.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple where ``audio_arr`` is the
        generated output moved to CPU, converted to a NumPy array and
        squeezed.
    """
    spoken_text = preprocess_text(text)

    description_tokens = tokenizer(description.strip(), return_tensors="pt")
    description_tokens = description_tokens.to(device)

    prompt_tokens = tokenizer(spoken_text, return_tensors="pt")
    prompt_tokens = prompt_tokens.to(device)

    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": prompt_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": prompt_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()

def _to_audio_segment(sample_rate, samples):
    """Wrap a mono waveform array in a 16-bit PCM ``AudioSegment``."""
    if samples.dtype.kind == 'f':
        # The segment is declared with sample_width=2 (16-bit PCM), so a
        # floating-point waveform has to be rescaled to int16 first; passing
        # its raw bytes would reinterpret float data as PCM (noise, wrong
        # duration).
        # NOTE(review): assumes float samples are nominally in [-1, 1];
        # values outside that range are clipped — confirm against the model.
        samples = (samples.clip(-1.0, 1.0) * 32767).astype('int16')
    return AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=2,
        channels=1
    )


def process_input(file, text_input, description, max_duration):
    """Turn an uploaded file or typed text into a single WAV file.

    The text is split into chunks, each chunk is synthesized with
    ``generate_audio``, and whole chunks are accumulated until adding the
    next one would exceed ``max_duration``.

    Args:
        file: Uploaded file (takes precedence when truthy), or ``None``.
        text_input: Text typed by the user; used when no file is given.
        description: Voice description forwarded to ``generate_audio``.
        max_duration: Maximum total audio length, in seconds.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when there
        is nothing to synthesize or an error occurred. Errors are reported
        through the message instead of being raised.
    """
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception as e:
        # An unsupported or unreadable upload is a user-facing condition;
        # report it the same way as generation errors rather than raising.
        return None, f"Error reading file: {str(e)}"

    # Whitespace-only input has nothing to synthesize either.
    if not text or not text.strip():
        return None, "Please provide text input or upload a file."

    try:
        audio_segments = []
        total_duration = 0

        for chunk in split_text_into_chunks(text):
            if not chunk:
                continue  # never send an empty prompt to the model

            sample_rate, samples = generate_audio(chunk, description)
            segment = _to_audio_segment(sample_rate, samples)

            chunk_duration = len(segment) / 1000  # Duration in seconds
            if total_duration + chunk_duration > max_duration:
                break

            audio_segments.append(segment)
            total_duration += chunk_duration

        if not audio_segments:
            return None, "Generated audio exceeds maximum duration. Please use shorter text."

        combined_audio = sum(audio_segments)

        # Reserve a path, then close the handle before exporting so the file
        # is not written through a second handle while still open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            wav_path = temp_file.name
        combined_audio.export(wav_path, format="wav")
        return wav_path, None
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"

def update_max_duration(file, text_input):
    """Suggest a value for the max-duration slider from the input length.

    The estimate assumes roughly three words per second and is clamped to
    the range 60-300 seconds. With no usable text the slider is reset to 60.

    Args:
        file: Uploaded file (takes precedence when truthy), or ``None``.
        text_input: Text typed by the user.

    Returns:
        A Gradio update carrying the new slider value.
    """
    try:
        text = extract_text_from_file(file) if file else text_input
    except (ValueError, OSError):
        # This is only a convenience estimate; an unsupported or unreadable
        # upload should not raise out of the change callback.
        text = text_input

    # gr.update(...) is used instead of gr.Slider.update(...): the
    # per-component .update() classmethods are not available in newer Gradio
    # releases, while gr.update works with the older ones as well.
    if not text:
        return gr.update(value=60)

    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
    return gr.update(value=min(300, max(60, estimated_duration)))

# Gradio interface
# Custom stylesheet passed to gr.Blocks below. The class names here
# (container, input-area, output-area, generate-btn) are the ones attached
# to components via elem_classes in the layout.
css = """
.container {
    max-width: 850px;
    margin: auto;
    padding: 20px;
    background-color: #f0f4f8;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.input-area, .output-area {
    background-color: white;
    padding: 25px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
    margin-bottom: 20px;
}
.generate-btn {
    background-color: #4CAF50 !important;
    color: white !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    font-weight: bold !important;
    border-radius: 5px !important;
    border: none !important;
    cursor: pointer !important;
    transition: background-color 0.3s !important;
}
.generate-btn:hover {
    background-color: #45a049 !important;
}
"""

# Build the UI: an input column (file upload, text box, voice description,
# duration slider, generate button) next to an output column (audio player
# and a Markdown area used for error messages).
with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")
    
    with gr.Row(elem_classes="container"):
        with gr.Column(elem_classes="input-area"):
            file_input = gr.File(label="📄 Upload File (TXT, DOCX, PDF)")
            text_input = gr.Textbox(label="✍️ Or enter text here", lines=5, placeholder="Type or paste your text here...")
            description = gr.Textbox(
                label="🗣️ Voice Description",
                lines=2,
                value="A clear, neutral voice with minimal background noise.",
                placeholder="Describe the voice characteristics you want..."
            )
            max_duration = gr.Slider(
                minimum=10,
                maximum=300,
                value=60,
                step=10,
                label="⏱️ Maximum Audio Duration (seconds)"
            )
            submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")
        
        with gr.Column(elem_classes="output-area"):
            output_audio = gr.Audio(label="🔊 Generated Audio")
            error_output = gr.Markdown()
    
    # Re-estimate the slider value whenever the uploaded file or the typed
    # text changes.
    file_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    text_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    # process_input returns (audio path or None, error message or None),
    # mapped onto the audio player and the error Markdown respectively.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, text_input, description, max_duration],
        outputs=[output_audio, error_output]
    )

    # NOTE(review): the "Maximum file size for uploads: 5MB" statement below
    # is not enforced by any code visible in this file — confirm it is
    # enforced elsewhere or adjust the text.
    gr.Markdown(
        """
        ## 📌 Tips for Best Results
        - For longer texts, the generator will create audio up to the specified maximum duration.
        - Experiment with different voice descriptions to achieve the desired output.
        - Use punctuation to control pacing and intonation in the generated speech.
        - For optimal quality, try to keep individual sentences or paragraphs concise.
        
        ## 🛠️ Technical Details
        - This demo uses the Parler TTS Mini v1 model.
        - Audio generation is GPU-accelerated for faster processing.
        - Maximum file size for uploads: 5MB
        """
    )

# Enable Gradio's request queue, then start the app. Both calls run at
# module import, so importing this file launches the server.
demo.queue()
demo.launch()