Spaces:

sethuiyer
/

ttsdoc

Runtime error

App Files Files Community

Sethu Iyer committed on Aug 11

Commit

020af7d

•

1 Parent(s): 2da35dc

App added

Browse files

Files changed (3) hide show

README.md +42 -5
app.py +222 -0
requirements.txt +7 -0

README.md CHANGED Viewed

@@ -1,13 +1,50 @@
 ---
-title: Ttsdoc
-emoji: 📚
-colorFrom: blue
-colorTo: pink
 sdk: gradio
 sdk_version: 4.41.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: ttsdoc
+emoji: 🌖
+colorFrom: yellow
+colorTo: gray
 sdk: gradio
 sdk_version: 4.41.0
 app_file: app.py
 pinned: false
 license: apache-2.0
 ---
+# ttsdoc 🌖
+ttsdoc is a Text-to-Speech (TTS) application that reads your documents aloud. It uses the Parler TTS Mini v1 model to generate high-quality audio from text typed directly into the app or extracted from uploaded PDF, TXT, and DOCX files.
+## Features
+- 📄 Support for PDF, TXT, and DOCX file uploads
+- ✍️ Direct text input option
+- 🗣️ Customizable voice descriptions
+- ⏱️ Adjustable maximum audio duration
+- 🚀 GPU-accelerated audio generation
+## How to Use
+1. Upload a PDF, TXT, or DOCX file or enter text directly.
+2. Customize the voice description if desired.
+3. Adjust the maximum audio duration.
+4. Click "Generate Audio" to create the TTS output.
+## Tips for Best Results
+- For longer texts, the generator will create audio up to the specified maximum duration.
+- Experiment with different voice descriptions to achieve the desired output.
+- Use punctuation to control pacing and intonation in the generated speech.
+- For optimal quality, try to keep individual sentences or paragraphs concise.
+## Technical Details
+- This demo uses the Parler TTS Mini v1 model.
+- Audio generation is GPU-accelerated for faster processing.
+- Maximum file size for uploads: 5MB
+## License
+This project is licensed under the Apache 2.0 License.
+---
+Powered by [Gradio](https://gradio.app) and [Hugging Face](https://huggingface.co)

app.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import spaces
+import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoFeatureExtractor
+from parler_tts import ParlerTTSForConditionalGeneration
+import docx2txt
+from PyPDF2 import PdfReader
+import re
+import os
+from pydub import AudioSegment
+import tempfile
+# Global variables and model initialization.
+# Everything below runs once, at import time; generate_audio() reuses these
+# module-level objects on every call.
+# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Checkpoint identifier; the same id is used for the model, tokenizer and
+# feature extractor loads below.
+repo_id = "parler-tts/parler-tts-mini-v1"
+model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
+tokenizer = AutoTokenizer.from_pretrained(repo_id)
+feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
+# Sampling rate reported by the feature extractor; generate_audio() returns it
+# together with the generated samples.
+SAMPLE_RATE = feature_extractor.sampling_rate
+def preprocess_text(text):
+    # Remove extra whitespace, normalize text, and handle numbers
+    text = re.sub(r'\s+', ' ', text).strip()
+    text = re.sub(r'\d+', lambda m: ' '.join(m.group(0)), text)
+    return text
+def extract_text_from_file(file):
+    if file.name.endswith('.txt'):
+        with open(file.name, 'r', encoding='utf-8') as f:
+            return f.read()
+    elif file.name.endswith('.docx'):
+        return docx2txt.process(file.name)
+    elif file.name.endswith('.pdf'):
+        with open(file.name, 'rb') as f:
+            reader = PdfReader(f)
+            return ' '.join([page.extract_text() for page in reader.pages])
+    else:
+        raise ValueError("Unsupported file type")
+def split_text_into_chunks(text, max_length=1000):
+    words = text.split()
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    for word in words:
+        if current_length + len(word) + 1 > max_length:
+            chunks.append(' '.join(current_chunk))
+            current_chunk = [word]
+            current_length = len(word)
+        else:
+            current_chunk.append(word)
+            current_length += len(word) + 1
+    if current_chunk:
+        chunks.append(' '.join(current_chunk))
+    return chunks
+@spaces.GPU(duration=300)
+def generate_audio(text, description):
+    preprocessed_text = preprocess_text(text)
+    inputs = tokenizer(description.strip(), return_tensors="pt").to(device)
+    prompt = tokenizer(preprocessed_text, return_tensors="pt").to(device)
+    generation = model.generate(
+        input_ids=inputs.input_ids,
+        prompt_input_ids=prompt.input_ids,
+        attention_mask=inputs.attention_mask,
+        prompt_attention_mask=prompt.attention_mask,
+        do_sample=True,
+        temperature=1.0
+    )
+    audio_arr = generation.cpu().numpy().squeeze()
+    return SAMPLE_RATE, audio_arr
+def process_input(file, text_input, description, max_duration):
+    if file:
+        text = extract_text_from_file(file)
+    else:
+        text = text_input
+    if not text:
+        return None, "Please provide text input or upload a file."
+    try:
+        chunks = split_text_into_chunks(text)
+        audio_segments = []
+        total_duration = 0
+        for chunk in chunks:
+            audio = generate_audio(chunk, description)
+            segment = AudioSegment(
+                audio[1].tobytes(),
+                frame_rate=audio[0],
+                sample_width=2,
+                channels=1
+            )
+            chunk_duration = len(segment) / 1000  # Duration in seconds
+            if total_duration + chunk_duration > max_duration:
+                break
+            audio_segments.append(segment)
+            total_duration += chunk_duration
+        if not audio_segments:
+            return None, "Generated audio exceeds maximum duration. Please use shorter text."
+        combined_audio = sum(audio_segments)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+            combined_audio.export(temp_file.name, format="wav")
+            return temp_file.name, None
+    except Exception as e:
+        return None, f"Error generating audio: {str(e)}"
+def update_max_duration(file, text_input):
+    if file:
+        text = extract_text_from_file(file)
+    else:
+        text = text_input
+    if not text:
+        return gr.Slider.update(value=60)
+    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
+    return gr.Slider.update(value=min(300, max(60, estimated_duration)))
+# Gradio interface
+# Custom stylesheet passed to gr.Blocks below. The class names defined here
+# (.container, .input-area, .output-area, .generate-btn) are the ones attached
+# to the layout through elem_classes.
+css = """
+.container {
+    max-width: 850px;
+    margin: auto;
+    padding: 20px;
+    background-color: #f0f4f8;
+    border-radius: 12px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+}
+.input-area, .output-area {
+    background-color: white;
+    padding: 25px;
+    border-radius: 8px;
+    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
+    margin-bottom: 20px;
+}
+.generate-btn {
+    background-color: #4CAF50 !important;
+    color: white !important;
+    padding: 10px 20px !important;
+    font-size: 16px !important;
+    font-weight: bold !important;
+    border-radius: 5px !important;
+    border: none !important;
+    cursor: pointer !important;
+    transition: background-color 0.3s !important;
+}
+.generate-btn:hover {
+    background-color: #45a049 !important;
+}
+"""
+# Build the UI: an input column (file, text, voice description, duration
+# slider, button) next to an output column (audio player, message area).
+with gr.Blocks(css=css) as demo:
+    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")
+    with gr.Row(elem_classes="container"):
+        with gr.Column(elem_classes="input-area"):
+            file_input = gr.File(label="📄 Upload File (TXT, DOCX, PDF)")
+            text_input = gr.Textbox(label="✍️ Or enter text here", lines=5, placeholder="Type or paste your text here...")
+            description = gr.Textbox(
+                label="🗣️ Voice Description",
+                lines=2,
+                value="A clear, neutral voice with minimal background noise.",
+                placeholder="Describe the voice characteristics you want..."
+            )
+            max_duration = gr.Slider(
+                minimum=10,
+                maximum=300,
+                value=60,
+                step=10,
+                label="⏱️ Maximum Audio Duration (seconds)"
+            )
+            submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")
+        with gr.Column(elem_classes="output-area"):
+            output_audio = gr.Audio(label="🔊 Generated Audio")
+            # Receives the second value returned by process_input (the message).
+            error_output = gr.Markdown()
+    # Re-estimate the duration slider whenever the uploaded file or the typed
+    # text changes.
+    file_input.change(
+        fn=update_max_duration,
+        inputs=[file_input, text_input],
+        outputs=[max_duration]
+    )
+    text_input.change(
+        fn=update_max_duration,
+        inputs=[file_input, text_input],
+        outputs=[max_duration]
+    )
+    # Generate audio on click; process_input returns (audio path, message).
+    submit_btn.click(
+        fn=process_input,
+        inputs=[file_input, text_input, description, max_duration],
+        outputs=[output_audio, error_output]
+    )
+    gr.Markdown(
+        """
+        ## 📌 Tips for Best Results
+        - For longer texts, the generator will create audio up to the specified maximum duration.
+        - Experiment with different voice descriptions to achieve the desired output.
+        - Use punctuation to control pacing and intonation in the generated speech.
+        - For optimal quality, try to keep individual sentences or paragraphs concise.
+        ## 🛠️ Technical Details
+        - This demo uses the Parler TTS Mini v1 model.
+        - Audio generation is GPU-accelerated for faster processing.
+        - Maximum file size for uploads: 5MB
+        """
+    )
+# Enable request queuing, then start the app.
+demo.queue()
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio==4.41.0
+torch
+transformers
+parler_tts
+docx2txt
+PyPDF2
+pydub