Sethu Iyer committed on
Commit
020af7d
•
1 Parent(s): 2da35dc
Files changed (3) hide show
  1. README.md +42 -5
  2. app.py +222 -0
  3. requirements.txt +7 -0
README.md CHANGED
@@ -1,13 +1,50 @@
1
  ---
2
- title: Ttsdoc
3
- emoji: 📚
4
- colorFrom: blue
5
- colorTo: pink
6
  sdk: gradio
7
  sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: ttsdoc
3
+ emoji: 🌖
4
+ colorFrom: yellow
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
12
+ # ttsdoc 🌖
13
 
14
+ ttsdoc is a Text-to-Speech (TTS) application that can read your PDF documents aloud. It uses the Parler TTS Mini v1 model to generate high-quality audio from text inputs, including uploaded PDF files.
15
+
16
+ ## Features
17
+
18
+ - 📄 Support for PDF, TXT, and DOCX file uploads
19
+ - ✍️ Direct text input option
20
+ - 🗣️ Customizable voice descriptions
21
+ - ⏱️ Adjustable maximum audio duration
22
+ - 🚀 GPU-accelerated audio generation
23
+
24
+ ## How to Use
25
+
26
+ 1. Upload a PDF, TXT, or DOCX file or enter text directly.
27
+ 2. Customize the voice description if desired.
28
+ 3. Adjust the maximum audio duration.
29
+ 4. Click "Generate Audio" to create the TTS output.
30
+
31
+ ## Tips for Best Results
32
+
33
+ - For longer texts, the generator will create audio up to the specified maximum duration.
34
+ - Experiment with different voice descriptions to achieve the desired output.
35
+ - Use punctuation to control pacing and intonation in the generated speech.
36
+ - For optimal quality, try to keep individual sentences or paragraphs concise.
37
+
38
+ ## Technical Details
39
+
40
+ - This demo uses the Parler TTS Mini v1 model.
41
+ - Audio generation is GPU-accelerated for faster processing.
42
+ - Maximum file size for uploads: 5MB
43
+
44
+ ## License
45
+
46
+ This project is licensed under the Apache 2.0 License.
47
+
48
+ ---
49
+
50
+ Powered by [Gradio](https://gradio.app) and [Hugging Face](https://huggingface.co)
app.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import gradio as gr
3
+ import torch
4
+ from transformers import AutoTokenizer, AutoFeatureExtractor
5
+ from parler_tts import ParlerTTSForConditionalGeneration
6
+ import docx2txt
7
+ from PyPDF2 import PdfReader
8
+ import re
9
+ import os
10
+ from pydub import AudioSegment
11
+ import tempfile
12
+
13
# Global variables and model initialization
repo_id = "parler-tts/parler-tts-mini-v1"

# Use the first CUDA device when torch reports one; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Model weights are loaded once at import time and moved to the chosen device.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate reported by the feature extractor; returned with every waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
20
+
21
def preprocess_text(text):
    """Normalize text before it is tokenized for speech generation.

    Every run of whitespace is collapsed to a single space and the result
    is trimmed. Each run of digits is then expanded so that its digits are
    separated by spaces (``"2024"`` becomes ``"2 0 2 4"``).
    """
    def _spell_out_digits(match):
        # Joining a string with ' ' puts a space between its characters.
        return ' '.join(match.group(0))

    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return re.sub(r'\d+', _spell_out_digits, trimmed)
26
+
27
def extract_text_from_file(file):
    """Return the text content of an uploaded TXT, DOCX or PDF file.

    Args:
        file: An object whose ``name`` attribute is the path of the file on
            disk, or the path itself as a string.

    Returns:
        The extracted text as a single string. PDF pages are joined with a
        single space.

    Raises:
        ValueError: If the file extension is not ``.txt``, ``.docx`` or
            ``.pdf`` (compared case-insensitively).
    """
    # Accept both upload objects exposing ``.name`` and plain path strings.
    path = getattr(file, "name", file)
    # Compare the extension case-insensitively so "REPORT.PDF" is accepted.
    extension = os.path.splitext(path)[1].lower()

    if extension == '.txt':
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    if extension == '.docx':
        return docx2txt.process(path)
    if extension == '.pdf':
        with open(path, 'rb') as f:
            reader = PdfReader(f)
            # ``or ''`` keeps the join from failing should a page yield
            # None instead of a string (e.g. a page without a text layer).
            return ' '.join(page.extract_text() or '' for page in reader.pages)
    raise ValueError("Unsupported file type")
39
+
40
def split_text_into_chunks(text, max_length=1000):
    """Split text on whitespace into chunks of at most ``max_length`` characters.

    Words are never broken: a single word longer than ``max_length`` becomes
    a chunk of its own. Words inside a chunk are joined by single spaces.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk, counting the
            joining spaces.

    Returns:
        A list of non-empty chunk strings; empty when ``text`` has no words.
    """
    chunks = []
    current_words = []
    current_length = 0

    for word in text.split():
        # Length of the joined chunk if this word were added. A separating
        # space is only needed when the chunk already holds a word.
        candidate_length = current_length + len(word) + (1 if current_words else 0)

        # Only flush a chunk that actually holds words; otherwise an
        # over-long first word would emit an empty chunk.
        if current_words and candidate_length > max_length:
            chunks.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = candidate_length

    if current_words:
        chunks.append(' '.join(current_words))

    return chunks
59
+
60
@spaces.GPU(duration=300)
def generate_audio(text, description):
    """Generate speech for ``text`` in the voice described by ``description``.

    The text is normalized with ``preprocess_text``. It is tokenized as the
    prompt and the stripped description as the conditioning input, and both
    are passed to the module-level model with sampling enabled.

    Returns:
        A ``(SAMPLE_RATE, audio_array)`` tuple, where ``audio_array`` is the
        model output moved to the CPU, converted to a NumPy array and
        squeezed.
    """
    spoken_text = preprocess_text(text)
    voice_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    text_tokens = tokenizer(spoken_text, return_tensors="pt").to(device)

    generation_kwargs = {
        "input_ids": voice_tokens.input_ids,
        "prompt_input_ids": text_tokens.input_ids,
        "attention_mask": voice_tokens.attention_mask,
        "prompt_attention_mask": text_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generation_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
77
+
78
def process_input(file, text_input, description, max_duration):
    """Turn an uploaded file or typed text into a WAV file of speech.

    The text is split into chunks, audio is generated chunk by chunk, and
    the pieces are concatenated until ``max_duration`` is reached. The last
    piece is trimmed to fit, so long texts yield audio up to the limit
    instead of failing.

    Args:
        file: Uploaded file (takes precedence when truthy), else ``None``.
        text_input: Text typed by the user; used when no file is given.
        description: Voice description forwarded to ``generate_audio``.
        max_duration: Maximum length of the result, in seconds.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, error_message)`` when
        there is no usable input or something fails.
    """
    # Reading the upload can fail (unsupported type, unreadable file);
    # report that to the user instead of raising out of the handler.
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception as e:
        return None, f"Error reading file: {str(e)}"

    # Whitespace-only input would produce zero chunks, so treat it as empty.
    if not text or not text.strip():
        return None, "Please provide text input or upload a file."

    try:
        max_duration_ms = int(max_duration * 1000)
        audio_segments = []
        total_ms = 0

        for chunk in split_text_into_chunks(text):
            remaining_ms = max_duration_ms - total_ms
            if remaining_ms <= 0:
                # Budget used up: skip generating audio that would be dropped.
                break

            sample_rate, samples = generate_audio(chunk, description)

            # AudioSegment below is told the data is 16-bit PCM, so the
            # bytes must really be int16. Assumes a floating-point waveform
            # is in [-1, 1] -- TODO confirm against the model output.
            if samples.dtype.kind == "f":
                pcm16 = (samples.clip(-1.0, 1.0) * 32767).astype("int16")
            else:
                pcm16 = samples.astype("int16")

            segment = AudioSegment(
                pcm16.tobytes(),
                frame_rate=sample_rate,
                sample_width=2,
                channels=1
            )

            # len(segment) is in milliseconds; trim the piece that would
            # overrun the limit rather than discarding it entirely.
            if len(segment) > remaining_ms:
                segment = segment[:remaining_ms]

            audio_segments.append(segment)
            total_ms += len(segment)

        if not audio_segments:
            return None, "Generated audio exceeds maximum duration. Please use shorter text."

        combined_audio = sum(audio_segments)

        # Reserve a path, close the handle, then export: writing to a path
        # whose handle is still open is not portable across platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        combined_audio.export(output_path, format="wav")
        return output_path, None
    except Exception as e:
        # UI boundary: surface any generation failure as a message.
        return None, f"Error generating audio: {str(e)}"
118
+
119
def update_max_duration(file, text_input):
    """Suggest a value for the maximum-duration slider from the input size.

    The estimate assumes roughly three spoken words per second and is
    clamped to the range 60-300 seconds. With no usable text the slider is
    set to 60.

    Args:
        file: Uploaded file (takes precedence when truthy), else ``None``.
        text_input: Text typed by the user; used when no file is given.

    Returns:
        A ``gr.update`` payload carrying the new slider value.
    """
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception:
        # Best effort: this only tunes a slider. An unreadable or
        # unsupported upload falls back to the default here.
        text = None

    if not text:
        return gr.update(value=60)

    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
    suggested = int(round(min(300, max(60, estimated_duration))))
    # Component-level ``.update()`` was removed in Gradio 4; ``gr.update``
    # is the supported way to change a property of an output component.
    return gr.update(value=suggested)
130
+
131
# Gradio interface
# Custom styling for the page container, the two panels and the submit button.
css = """
.container {
    max-width: 850px;
    margin: auto;
    padding: 20px;
    background-color: #f0f4f8;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.input-area, .output-area {
    background-color: white;
    padding: 25px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
    margin-bottom: 20px;
}
.generate-btn {
    background-color: #4CAF50 !important;
    color: white !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    font-weight: bold !important;
    border-radius: 5px !important;
    border: none !important;
    cursor: pointer !important;
    transition: background-color 0.3s !important;
}
.generate-btn:hover {
    background-color: #45a049 !important;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")

    with gr.Row(elem_classes="container"):
        # Left panel: input source, voice description and duration limit.
        with gr.Column(elem_classes="input-area"):
            file_input = gr.File(
                label="📄 Upload File (TXT, DOCX, PDF)",
                # Limit the picker to the extensions extract_text_from_file handles.
                file_types=[".txt", ".docx", ".pdf"],
            )
            text_input = gr.Textbox(
                label="✍️ Or enter text here",
                lines=5,
                placeholder="Type or paste your text here...",
            )
            description = gr.Textbox(
                label="🗣️ Voice Description",
                lines=2,
                value="A clear, neutral voice with minimal background noise.",
                placeholder="Describe the voice characteristics you want..."
            )
            max_duration = gr.Slider(
                minimum=10,
                maximum=300,
                value=60,
                step=10,
                label="⏱️ Maximum Audio Duration (seconds)"
            )
            submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")

        # Right panel: generated audio, plus any error message.
        with gr.Column(elem_classes="output-area"):
            output_audio = gr.Audio(label="🔊 Generated Audio")
            error_output = gr.Markdown()

    # Re-estimate the duration slider whenever either input source changes.
    file_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    text_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration]
    )
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, text_input, description, max_duration],
        outputs=[output_audio, error_output]
    )

    gr.Markdown(
        """
        ## 📌 Tips for Best Results
        - For longer texts, the generator will create audio up to the specified maximum duration.
        - Experiment with different voice descriptions to achieve the desired output.
        - Use punctuation to control pacing and intonation in the generated speech.
        - For optimal quality, try to keep individual sentences or paragraphs concise.

        ## 🛠️ Technical Details
        - This demo uses the Parler TTS Mini v1 model.
        - Audio generation is GPU-accelerated for faster processing.
        - Maximum file size for uploads: 5MB
        """
    )

demo.queue()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==4.41.0
2
+ torch
3
+ transformers
4
+ parler_tts
5
+ docx2txt
6
+ PyPDF2
7
+ pydub