Spaces:

sethuiyer
/

ttsdoc

Runtime error

ttsdoc / app.py

Sethu Iyer

App added

020af7d 3 months ago

7.34 kB

	import spaces
	import gradio as gr
	import torch
	from transformers import AutoTokenizer, AutoFeatureExtractor
	from parler_tts import ParlerTTSForConditionalGeneration
	import docx2txt
	from PyPDF2 import PdfReader
	import re
	import os
	from pydub import AudioSegment
	import tempfile

	# Global variables and model initialization
	# Everything below runs once, at import time, and is shared by every request.
	# Use the first CUDA device when torch reports one, otherwise fall back to CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"
	# Repository id from which the model, tokenizer and feature extractor are all loaded.
	repo_id = "parler-tts/parler-tts-mini-v1"
	# Load the TTS model and move its weights to the selected device.
	model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
	# The same tokenizer is used for both the voice description and the text prompt
	# (see generate_audio).
	tokenizer = AutoTokenizer.from_pretrained(repo_id)
	feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
	# Sampling rate reported by the feature extractor; generate_audio returns it
	# alongside each generated waveform.
	SAMPLE_RATE = feature_extractor.sampling_rate

	def preprocess_text(text):
	    """Normalise raw text before it is tokenised for speech synthesis.

	    Every run of whitespace is collapsed to a single space and the result is
	    trimmed. Afterwards each run of digits is spread out so that its
	    characters are separated by spaces (``"911"`` becomes ``"9 1 1"``).

	    Args:
	        text: The input string.

	    Returns:
	        The normalised string.
	    """
	    def _space_out(match):
	        # Joining a string with ' ' inserts a space between its characters.
	        return ' '.join(match.group(0))

	    squeezed = re.sub(r'\s+', ' ', text).strip()
	    return re.sub(r'\d+', _space_out, squeezed)

	def extract_text_from_file(file):
	    """Return the text content of an uploaded TXT, DOCX or PDF file.

	    Args:
	        file: Either a filesystem path (``str`` / ``os.PathLike``) or an object
	            exposing the path as its ``name`` attribute (the shape this module
	            originally expected from the upload widget).

	    Returns:
	        The extracted text. For PDFs the per-page texts are joined with a
	        single space.

	    Raises:
	        ValueError: If the file extension is not ``.txt``, ``.docx`` or ``.pdf``.
	    """
	    # Accept plain paths as well as upload objects, so the function works no
	    # matter which of the two the caller hands over.
	    if isinstance(file, (str, os.PathLike)):
	        path = os.fspath(file)
	    else:
	        path = file.name

	    # Compare extensions case-insensitively so "REPORT.PDF" is not rejected.
	    lowered = path.lower()

	    if lowered.endswith('.txt'):
	        with open(path, 'r', encoding='utf-8') as f:
	            return f.read()
	    if lowered.endswith('.docx'):
	        return docx2txt.process(path)
	    if lowered.endswith('.pdf'):
	        with open(path, 'rb') as f:
	            reader = PdfReader(f)
	            # Guard against pages that yield no text (None) so the join
	            # cannot fail with a TypeError.
	            return ' '.join((page.extract_text() or '') for page in reader.pages)
	    raise ValueError("Unsupported file type")

	def split_text_into_chunks(text, max_length=1000):
	    """Split *text* on whitespace into chunks of roughly ``max_length`` characters.

	    Words are never broken: a single word longer than ``max_length`` becomes a
	    chunk of its own. The running length counts one separator per word, so a
	    chunk's joined length never exceeds ``max_length`` unless it consists of
	    one oversized word.

	    Args:
	        text: The text to split.
	        max_length: Character budget per chunk.

	    Returns:
	        A list of non-empty chunk strings; empty when *text* has no words.
	    """
	    chunks = []
	    current_chunk = []
	    current_length = 0

	    for word in text.split():
	        if current_length + len(word) + 1 > max_length:
	            # Flush only when something has been collected. Without this
	            # guard an oversized first word produced a leading '' chunk.
	            if current_chunk:
	                chunks.append(' '.join(current_chunk))
	            current_chunk = [word]
	            current_length = len(word)
	        else:
	            current_chunk.append(word)
	            current_length += len(word) + 1

	    if current_chunk:
	        chunks.append(' '.join(current_chunk))

	    return chunks

	@spaces.GPU(duration=300)
	def generate_audio(text, description):
	    """Synthesise speech for *text* in the voice characterised by *description*.

	    The text is first passed through ``preprocess_text``. The stripped
	    description is tokenised and supplied to the model as ``input_ids``, the
	    preprocessed text as ``prompt_input_ids``; generation samples with
	    temperature 1.0.

	    Args:
	        text: The text to be spoken.
	        description: Free-form description of the desired voice.

	    Returns:
	        A ``(SAMPLE_RATE, waveform)`` tuple where the waveform is the model
	        output moved to the CPU, converted to a NumPy array and squeezed.
	    """
	    cleaned_text = preprocess_text(text)
	    voice_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
	    prompt_tokens = tokenizer(cleaned_text, return_tensors="pt").to(device)

	    generate_kwargs = dict(
	        input_ids=voice_tokens.input_ids,
	        prompt_input_ids=prompt_tokens.input_ids,
	        attention_mask=voice_tokens.attention_mask,
	        prompt_attention_mask=prompt_tokens.attention_mask,
	        do_sample=True,
	        temperature=1.0,
	    )
	    waveform = model.generate(**generate_kwargs)

	    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()

	def _pcm16_segment(sample_rate, samples):
	    """Wrap a mono NumPy waveform in a 16-bit PCM ``AudioSegment``."""
	    if samples.dtype.kind == 'f':
	        # The segment is declared as 2 bytes per sample, so a float waveform
	        # must be rescaled to int16 first; feeding its raw float bytes would
	        # play back as noise with the wrong duration.
	        # NOTE(review): assumes float samples lie in [-1.0, 1.0] - confirm.
	        samples = (samples.clip(-1.0, 1.0) * 32767.0).astype('int16')
	    else:
	        samples = samples.astype('int16')
	    return AudioSegment(
	        samples.tobytes(),
	        frame_rate=sample_rate,
	        sample_width=2,
	        channels=1,
	    )

	def process_input(file, text_input, description, max_duration):
	    """Turn an uploaded file or typed text into a single WAV file.

	    The text is split into chunks, each chunk is synthesised with
	    ``generate_audio`` and the resulting segments are concatenated until the
	    next one would push the total past ``max_duration``.

	    Args:
	        file: Uploaded file (takes precedence over ``text_input`` when truthy).
	        text_input: Text typed by the user.
	        description: Voice description forwarded to ``generate_audio``.
	        max_duration: Upper bound for the combined audio length, in seconds.

	    Returns:
	        ``(wav_path, None)`` on success, or ``(None, message)`` when there is
	        no usable text, nothing fits into ``max_duration``, or an error occurs.
	    """
	    # Report extraction failures through the same error channel as the rest.
	    try:
	        text = extract_text_from_file(file) if file else text_input
	    except Exception as e:
	        return None, f"Error reading file: {str(e)}"

	    # Whitespace-only input yields no chunks; reject it here instead of
	    # falling through to the misleading "exceeds maximum duration" message.
	    if not text or not text.strip():
	        return None, "Please provide text input or upload a file."

	    try:
	        audio_segments = []
	        total_duration = 0

	        for chunk in split_text_into_chunks(text):
	            if not chunk:
	                # Never ask the model to speak an empty prompt.
	                continue

	            sample_rate, samples = generate_audio(chunk, description)
	            segment = _pcm16_segment(sample_rate, samples)

	            chunk_duration = len(segment) / 1000  # len() is in milliseconds
	            if total_duration + chunk_duration > max_duration:
	                break

	            audio_segments.append(segment)
	            total_duration += chunk_duration

	        if not audio_segments:
	            return None, "Generated audio exceeds maximum duration. Please use shorter text."

	        combined_audio = sum(audio_segments)

	        # Reserve a path, close the handle, then export: writing to a file
	        # that is still open elsewhere is not portable.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
	            output_path = temp_file.name
	        combined_audio.export(output_path, format="wav")
	        return output_path, None
	    except Exception as e:
	        # UI boundary: surface the failure as a message rather than crashing.
	        return None, f"Error generating audio: {str(e)}"

	def update_max_duration(file, text_input):
	    """Suggest a value for the maximum-duration slider from the input length.

	    The estimate assumes roughly three spoken words per second and is clamped
	    to the range 60-300 seconds; with no text the slider is reset to 60.

	    Args:
	        file: Uploaded file (takes precedence over ``text_input`` when truthy).
	        text_input: Text typed by the user.

	    Returns:
	        A Gradio update carrying the new slider value.
	    """
	    text = extract_text_from_file(file) if file else text_input

	    # gr.update() is used instead of the per-component gr.Slider.update()
	    # classmethod, which no longer exists in Gradio 4 and later.
	    if not text:
	        return gr.update(value=60)

	    estimated_duration = len(text.split()) / 3  # Rough estimate: 3 words per second
	    return gr.update(value=min(300, max(60, estimated_duration)))

	# Gradio interface
	# Custom stylesheet passed to gr.Blocks(css=...). The class names used here
	# (container, input-area, output-area, generate-btn) are attached to
	# components through their elem_classes argument in the layout below.
	css = """
	.container {
	max-width: 850px;
	margin: auto;
	padding: 20px;
	background-color: #f0f4f8;
	border-radius: 12px;
	box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
	}
	.input-area, .output-area {
	background-color: white;
	padding: 25px;
	border-radius: 8px;
	box-shadow: 0 2px 4px rgba(0,0,0,0.05);
	margin-bottom: 20px;
	}
	.generate-btn {
	background-color: #4CAF50 !important;
	color: white !important;
	padding: 10px 20px !important;
	font-size: 16px !important;
	font-weight: bold !important;
	border-radius: 5px !important;
	border: none !important;
	cursor: pointer !important;
	transition: background-color 0.3s !important;
	}
	.generate-btn:hover {
	background-color: #45a049 !important;
	}
	"""

	# Build the UI: an input column (file upload, text box, voice description,
	# duration slider, generate button) and an output column (audio player plus a
	# Markdown area that shows error messages).
	with gr.Blocks(css=css) as demo:
	gr.Markdown("# 🎙️ Parler TTS: Advanced Text-to-Speech Generator")

	with gr.Row(elem_classes="container"):
	with gr.Column(elem_classes="input-area"):
	file_input = gr.File(label="📄 Upload File (TXT, DOCX, PDF)")
	text_input = gr.Textbox(label="✍️ Or enter text here", lines=5, placeholder="Type or paste your text here...")
	description = gr.Textbox(
	label="🗣️ Voice Description",
	lines=2,
	value="A clear, neutral voice with minimal background noise.",
	placeholder="Describe the voice characteristics you want..."
	)
	max_duration = gr.Slider(
	minimum=10,
	maximum=300,
	value=60,
	step=10,
	label="⏱️ Maximum Audio Duration (seconds)"
	)
	submit_btn = gr.Button("🚀 Generate Audio", elem_classes="generate-btn")

	with gr.Column(elem_classes="output-area"):
	output_audio = gr.Audio(label="🔊 Generated Audio")
	error_output = gr.Markdown()

	# Re-estimate the duration slider whenever the uploaded file or the typed
	# text changes; both events call the same handler with the same inputs.
	file_input.change(
	fn=update_max_duration,
	inputs=[file_input, text_input],
	outputs=[max_duration]
	)
	text_input.change(
	fn=update_max_duration,
	inputs=[file_input, text_input],
	outputs=[max_duration]
	)
	# process_input returns (audio path, error message), mapped onto the audio
	# player and the error Markdown respectively.
	submit_btn.click(
	fn=process_input,
	inputs=[file_input, text_input, description, max_duration],
	outputs=[output_audio, error_output]
	)

	# Static help text shown in the UI.
	# NOTE(review): the "Maximum file size for uploads: 5MB" statement is not
	# enforced by any code visible in this file - confirm it is enforced elsewhere.
	gr.Markdown(
	"""
	## 📌 Tips for Best Results
	- For longer texts, the generator will create audio up to the specified maximum duration.
	- Experiment with different voice descriptions to achieve the desired output.
	- Use punctuation to control pacing and intonation in the generated speech.
	- For optimal quality, try to keep individual sentences or paragraphs concise.

	## 🛠️ Technical Details
	- This demo uses the Parler TTS Mini v1 model.
	- Audio generation is GPU-accelerated for faster processing.
	- Maximum file size for uploads: 5MB
	"""
	)

	# Enable request queuing, then start the app.
	demo.queue()
	demo.launch()