|
import spaces |
|
import gradio as gr |
|
import torch |
|
from transformers import AutoTokenizer, AutoFeatureExtractor |
|
from parler_tts import ParlerTTSForConditionalGeneration |
|
import docx2txt |
|
from PyPDF2 import PdfReader |
|
import re |
|
import os |
|
from pydub import AudioSegment |
|
import tempfile |
|
|
|
|
|
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Checkpoint identifier passed to every ``from_pretrained`` call below.
repo_id = "parler-tts/parler-tts-mini-v1"

# Model, tokenizer and feature extractor are loaded once, at import time.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate reported by the feature extractor; returned with generated audio.
SAMPLE_RATE = feature_extractor.sampling_rate
|
|
|
def preprocess_text(text):
    """Normalize *text* before it is tokenized for speech generation.

    Every run of whitespace is collapsed to a single space and the ends are
    trimmed. Afterwards each run of digits is spelled out digit by digit,
    separated by spaces (``"2024"`` becomes ``"2 0 2 4"``).

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """

    def _spell_out(match):
        # Joining the characters of the match puts a space between digits.
        return " ".join(match.group(0))

    collapsed = re.sub(r"\s+", " ", text)
    trimmed = collapsed.strip()
    return re.sub(r"\d+", _spell_out, trimmed)
|
|
|
def extract_text_from_file(file):
    """Return the text content of a ``.txt``, ``.docx`` or ``.pdf`` file.

    Args:
        file: Either an object exposing the on-disk path as ``file.name``
            or the path itself (``str`` / ``os.PathLike``).

    Returns:
        The extracted text as one string. PDF pages are joined with a
        single space.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    # Accept a bare path as well as a wrapper object that carries ``.name``.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Match extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered = path.lower()

    if lowered.endswith(".txt"):
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    if lowered.endswith(".docx"):
        return docx2txt.process(path)

    if lowered.endswith(".pdf"):
        with open(path, "rb") as f:
            reader = PdfReader(f)
            # ``or ""`` keeps the join safe should a page yield no text object.
            return " ".join(page.extract_text() or "" for page in reader.pages)

    raise ValueError("Unsupported file type")
|
|
|
def split_text_into_chunks(text, max_length=1000):
    """Split *text* on whitespace into chunks of at most *max_length* characters.

    Words are never broken. A single word longer than *max_length* becomes a
    chunk of its own, which is the only case in which a chunk exceeds the
    limit.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk, counting the
            single spaces that join the words.

    Returns:
        A list of non-empty chunk strings. The list is empty when *text*
        contains no words.
    """
    chunks = []
    current_words = []
    # Invariant: current_length == len(" ".join(current_words)).
    current_length = 0

    for word in text.split():
        # A joining space is only needed when the chunk already has a word.
        separator = 1 if current_words else 0
        candidate_length = current_length + separator + len(word)

        # Only flush a chunk that actually holds words; otherwise an
        # oversized first word would produce an empty leading chunk.
        if current_words and candidate_length > max_length:
            chunks.append(" ".join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length = candidate_length

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
|
|
|
@spaces.GPU(duration=300)
def generate_audio(text, description):
    """Synthesize speech for *text* in the voice described by *description*.

    Args:
        text: The text to speak; it is passed through ``preprocess_text``
            before tokenization.
        description: Free-form description of the desired voice; surrounding
            whitespace is stripped before tokenization.

    Returns:
        A ``(SAMPLE_RATE, samples)`` tuple where ``samples`` is the model
        output moved to the CPU, converted to a NumPy array and squeezed.
    """
    spoken_text = preprocess_text(text)

    # The description feeds ``input_ids``; the text to speak feeds the prompt.
    voice_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(spoken_text, return_tensors="pt").to(device)

    generate_kwargs = dict(
        input_ids=voice_tokens.input_ids,
        prompt_input_ids=prompt_tokens.input_ids,
        attention_mask=voice_tokens.attention_mask,
        prompt_attention_mask=prompt_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = model.generate(**generate_kwargs)

    samples = waveform.cpu().numpy().squeeze()
    return SAMPLE_RATE, samples
|
|
|
def _to_pcm_segment(sample_rate, samples):
    """Wrap generated samples in a mono ``AudioSegment`` of integer PCM."""
    if samples.dtype.kind == "f":
        # Raw float bytes are not valid PCM. Scale to signed 16-bit first.
        # NOTE(review): assumes float output is a waveform nominally in
        # [-1.0, 1.0]; clipping guards against overshoot. Confirm for the model.
        samples = (samples.clip(-1.0, 1.0) * 32767).astype("int16")

    return AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )


def process_input(file, text_input, description, max_duration):
    """Generate speech for an uploaded file or typed text and save it as WAV.

    The text is split into chunks, each chunk is synthesized separately, and
    chunks are appended until adding the next one would exceed
    *max_duration*.

    Args:
        file: Uploaded file; takes precedence over *text_input* when truthy.
        text_input: Text typed by the user, used when no file is given.
        description: Voice description forwarded to ``generate_audio``.
        max_duration: Maximum total audio length in seconds.

    Returns:
        ``(wav_path, None)`` on success, or ``(None, message)`` when there is
        no usable text, nothing fits into *max_duration*, or an error occurs.
    """
    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception as e:
        # Report unreadable or unsupported uploads instead of raising.
        return None, f"Error reading file: {str(e)}"

    # Whitespace-only input has nothing to synthesize either.
    if not text or not text.strip():
        return None, "Please provide text input or upload a file."

    try:
        audio_segments = []
        total_duration = 0

        for chunk in split_text_into_chunks(text):
            if not chunk:
                continue

            sample_rate, samples = generate_audio(chunk, description)
            segment = _to_pcm_segment(sample_rate, samples)

            # len() of a pydub segment is its duration in milliseconds.
            chunk_duration = len(segment) / 1000
            if total_duration + chunk_duration > max_duration:
                break

            audio_segments.append(segment)
            total_duration += chunk_duration

        if not audio_segments:
            return None, "Generated audio exceeds maximum duration. Please use shorter text."

        combined_audio = sum(audio_segments)

        # Reserve a path, then close the handle before exporting by name so
        # the file is not opened twice at the same time.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name
        combined_audio.export(output_path, format="wav")

        return output_path, None
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"
|
|
|
def update_max_duration(file, text_input):
    """Suggest a value for the maximum-duration slider from the input length.

    The estimate is one second per three words, clamped to the range
    60-300 seconds. With no text, or when the uploaded file cannot be read,
    the suggestion is 60 seconds.

    Args:
        file: Uploaded file; takes precedence over *text_input* when truthy.
        text_input: Text typed by the user, used when no file is given.

    Returns:
        A Gradio update carrying the suggested slider value.
    """
    default_seconds = 60

    try:
        text = extract_text_from_file(file) if file else text_input
    except Exception:
        # This handler only provides a hint; extraction is attempted again
        # on submit, so do not fail the change event here.
        return gr.update(value=default_seconds)

    if not text:
        return gr.update(value=default_seconds)

    estimated_duration = len(text.split()) / 3
    # ``gr.update`` is used instead of the component-specific
    # ``gr.Slider.update`` so the handler is not tied to one Gradio release.
    return gr.update(value=min(300, max(default_seconds, estimated_duration)))
|
|
|
|
|
# Stylesheet handed to ``gr.Blocks(css=...)``. The class names below are the
# ones attached to components through ``elem_classes``.
css = """
.container {
    max-width: 850px;
    margin: auto;
    padding: 20px;
    background-color: #f0f4f8;
    border-radius: 12px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.input-area, .output-area {
    background-color: white;
    padding: 25px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
    margin-bottom: 20px;
}
.generate-btn {
    background-color: #4CAF50 !important;
    color: white !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    font-weight: bold !important;
    border-radius: 5px !important;
    border: none !important;
    cursor: pointer !important;
    transition: background-color 0.3s !important;
}
.generate-btn:hover {
    background-color: #45a049 !important;
}
"""
|
|
|
# Page layout and event wiring.
# NOTE(review): the emoji in the labels were mis-encoded in the file and have
# been restored from the surviving bytes and context. Confirm the intended
# glyphs.
with gr.Blocks(css=css) as demo:
    gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")

    with gr.Row(elem_classes="container"):
        # Left column: text source, voice description and duration limit.
        with gr.Column(elem_classes="input-area"):
            file_input = gr.File(label="📁 Upload File (TXT, DOCX, PDF)")
            text_input = gr.Textbox(
                label="✍️ Or enter text here",
                lines=5,
                placeholder="Type or paste your text here...",
            )
            description = gr.Textbox(
                label="🗣️ Voice Description",
                lines=2,
                value="A clear, neutral voice with minimal background noise.",
                placeholder="Describe the voice characteristics you want...",
            )
            max_duration = gr.Slider(
                minimum=10,
                maximum=300,
                value=60,
                step=10,
                label="⏱️ Maximum Audio Duration (seconds)",
            )
            submit_btn = gr.Button("🎵 Generate Audio", elem_classes="generate-btn")

        # Right column: generated audio and any status or error message.
        with gr.Column(elem_classes="output-area"):
            output_audio = gr.Audio(label="🔊 Generated Audio")
            error_output = gr.Markdown()

    # Re-estimate the duration slider whenever either text source changes.
    file_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration],
    )
    text_input.change(
        fn=update_max_duration,
        inputs=[file_input, text_input],
        outputs=[max_duration],
    )

    # process_input returns (audio path, message), matching the two outputs.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, text_input, description, max_duration],
        outputs=[output_audio, error_output],
    )

    gr.Markdown(
        """
        ## 📝 Tips for Best Results
        - For longer texts, the generator will create audio up to the specified maximum duration.
        - Experiment with different voice descriptions to achieve the desired output.
        - Use punctuation to control pacing and intonation in the generated speech.
        - For optimal quality, try to keep individual sentences or paragraphs concise.

        ## 🛠️ Technical Details
        - This demo uses the Parler TTS Mini v1 model.
        - Audio generation is GPU-accelerated for faster processing.
        - Maximum file size for uploads: 5MB
        """
    )
|
|
|
# Enable the request queue, then start serving the app.
demo.queue().launch()
|
|