Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -47,17 +47,16 @@ model.cuda()
|
|
47 |
|
48 |
print("Modelo cargado en GPU")
|
49 |
|
50 |
-
def predict(prompt, language, reference_audio):
|
51 |
try:
|
52 |
if len(prompt) < 2 or len(prompt) > 600:
|
53 |
return None, "El texto debe tener entre 2 y 600 caracteres."
|
54 |
|
55 |
-
#
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
top_p = config.model_args.get("top_p", 0.85)
|
61 |
|
62 |
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
|
63 |
audio_path=reference_audio
|
@@ -70,7 +69,7 @@ def predict(prompt, language, reference_audio):
|
|
70 |
language,
|
71 |
gpt_cond_latent,
|
72 |
speaker_embedding,
|
73 |
-
temperature=temperature,
|
74 |
length_penalty=length_penalty,
|
75 |
repetition_penalty=repetition_penalty,
|
76 |
top_k=top_k,
|
@@ -80,11 +79,9 @@ def predict(prompt, language, reference_audio):
|
|
80 |
inference_time = time.time() - start_time
|
81 |
|
82 |
output_path = "pedro_labattaglia_TTS.wav"
|
83 |
-
# Guardar el audio directamente desde el output del modelo
|
84 |
-
import scipy.io.wavfile as wavfile
|
85 |
wavfile.write(output_path, config.audio["output_sample_rate"], out["wav"])
|
86 |
|
87 |
-
audio_length = len(out["wav"]) / config.audio["output_sample_rate"]
|
88 |
real_time_factor = inference_time / audio_length
|
89 |
|
90 |
metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
|
@@ -146,17 +143,27 @@ with gr.Blocks(theme=theme) as demo:
|
|
146 |
language_selector = gr.Dropdown(label="Idioma", choices=supported_languages)
|
147 |
reference_audio = gr.Dropdown(label="Audio de referencia", choices=reference_audios)
|
148 |
input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
generate_button = gr.Button("Generar voz", variant="primary")
|
150 |
|
151 |
with gr.Column(scale=1):
|
152 |
generated_audio = gr.Audio(label="Audio generado", interactive=False)
|
153 |
metrics_output = gr.Textbox(label="Métricas", value="Tiempo de generación: -- segundos\nFactor de tiempo real: --")
|
154 |
|
155 |
-
|
156 |
# Configuración del botón para generar voz
|
157 |
generate_button.click(
|
158 |
predict,
|
159 |
-
inputs=[input_text, language_selector, reference_audio],
|
160 |
outputs=[generated_audio, metrics_output]
|
161 |
)
|
162 |
|
|
|
47 |
|
48 |
print("Modelo cargado en GPU")
|
49 |
|
50 |
+
def predict(prompt, language, reference_audio, temperature):
|
51 |
try:
|
52 |
if len(prompt) < 2 or len(prompt) > 600:
|
53 |
return None, "El texto debe tener entre 2 y 600 caracteres."
|
54 |
|
55 |
+
# Otros parámetros pueden mantenerse como están o ajustarse ligeramente.
|
56 |
+
length_penalty = 1.0
|
57 |
+
repetition_penalty = 2.0
|
58 |
+
top_k = 50
|
59 |
+
top_p = 0.7 # Mayor estabilidad
|
|
|
60 |
|
61 |
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
|
62 |
audio_path=reference_audio
|
|
|
69 |
language,
|
70 |
gpt_cond_latent,
|
71 |
speaker_embedding,
|
72 |
+
temperature=temperature, # Pasar temperatura ajustada
|
73 |
length_penalty=length_penalty,
|
74 |
repetition_penalty=repetition_penalty,
|
75 |
top_k=top_k,
|
|
|
79 |
inference_time = time.time() - start_time
|
80 |
|
81 |
output_path = "pedro_labattaglia_TTS.wav"
|
|
|
|
|
82 |
wavfile.write(output_path, config.audio["output_sample_rate"], out["wav"])
|
83 |
|
84 |
+
audio_length = len(out["wav"]) / config.audio["output_sample_rate"]
|
85 |
real_time_factor = inference_time / audio_length
|
86 |
|
87 |
metrics_text = f"Tiempo de generación: {inference_time:.2f} segundos\n"
|
|
|
143 |
language_selector = gr.Dropdown(label="Idioma", choices=supported_languages)
|
144 |
reference_audio = gr.Dropdown(label="Audio de referencia", choices=reference_audios)
|
145 |
input_text = gr.Textbox(label="Texto a sintetizar", placeholder="Escribe aquí el texto que quieres convertir a voz...")
|
146 |
+
|
147 |
+
# Aquí está el slider de temperatura con una breve explicación
|
148 |
+
temperature_slider = gr.Slider(
|
149 |
+
minimum=0.1,
|
150 |
+
maximum=1.0,
|
151 |
+
value=0.2,
|
152 |
+
step=0.05,
|
153 |
+
label="Temperatura (Estabilidad vs Creatividad)",
|
154 |
+
info="Valores bajos generan una voz más estable pero menos creativa, mientras que valores más altos permiten más variabilidad en la voz."
|
155 |
+
)
|
156 |
+
|
157 |
generate_button = gr.Button("Generar voz", variant="primary")
|
158 |
|
159 |
with gr.Column(scale=1):
|
160 |
generated_audio = gr.Audio(label="Audio generado", interactive=False)
|
161 |
metrics_output = gr.Textbox(label="Métricas", value="Tiempo de generación: -- segundos\nFactor de tiempo real: --")
|
162 |
|
|
|
163 |
# Configuración del botón para generar voz
|
164 |
generate_button.click(
|
165 |
predict,
|
166 |
+
inputs=[input_text, language_selector, reference_audio, temperature_slider], # Se incluye el slider aquí
|
167 |
outputs=[generated_audio, metrics_output]
|
168 |
)
|
169 |
|