|
from typing import Dict, Optional, Union |
|
|
|
import numpy as np |
|
|
|
from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic |
|
|
|
|
|
def generate_with_settings(
    text_prompt,
    semantic_temp=0.6,
    eos_p=0.2,
    coarse_temp=0.7,
    fine_temp=0.5,
    voice_name=None,
    output_full=False,
):
    """Run the full text -> semantic -> coarse -> fine -> decode chain.

    Unlike the other entry points in this module, every stage temperature
    is exposed as its own parameter.

    Args:
        text_prompt: text handed to the semantic stage
        semantic_temp: temperature for the semantic stage
        eos_p: forwarded to the semantic stage as ``min_eos_p``
        coarse_temp: temperature for the coarse stage
        fine_temp: temperature for the fine stage
        voice_name: forwarded to all three stages as ``history_prompt``
        output_full: also return the intermediate token arrays

    Returns:
        The result of ``codec_decode`` on the fine tokens. When
        ``output_full`` is true, a ``(full_generation, decoded)`` tuple
        where ``full_generation`` is a dict with the keys
        ``semantic_prompt``, ``coarse_prompt`` and ``fine_prompt``.
    """
    semantic = generate_text_semantic(
        text_prompt,
        history_prompt=voice_name,
        temp=semantic_temp,
        min_eos_p=eos_p,
        use_kv_caching=True,
    )
    coarse = generate_coarse(
        semantic,
        history_prompt=voice_name,
        temp=coarse_temp,
        use_kv_caching=True,
    )
    fine = generate_fine(coarse, history_prompt=voice_name, temp=fine_temp)

    # Both return paths decode the same fine tokens, so decode once up front.
    decoded = codec_decode(fine)
    if not output_full:
        return decoded

    stages = {
        'semantic_prompt': semantic,
        'coarse_prompt': coarse,
        'fine_prompt': fine,
    }
    return stages, decoded
|
|
|
|
|
def text_to_semantic(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
):
    """Turn input text into semantic tokens.

    Thin wrapper around ``generate_text_semantic`` that always enables
    KV caching.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar

    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
|
|
|
|
|
def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Convert semantic tokens into an audio array.

    Runs the coarse stage, then the fine stage, then ``codec_decode``.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: generation temperature for the coarse stage
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar (forwarded to the coarse stage)
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz. When ``output_full``
        is true, a ``(full_generation, audio)`` tuple where
        ``full_generation`` is a dict with the keys ``semantic_prompt``,
        ``coarse_prompt`` and ``fine_prompt``.
    """
    coarse = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    # The fine stage runs at a fixed temperature of 0.5; `temp` and `silent`
    # are deliberately not forwarded here (matches the existing behavior).
    fine = generate_fine(coarse, history_prompt=history_prompt, temp=0.5)
    waveform = codec_decode(fine)

    if not output_full:
        return waveform

    stages = {
        "semantic_prompt": semantic_tokens,
        "coarse_prompt": coarse,
        "fine_prompt": fine,
    }
    return stages, waveform
|
|
|
|
|
def save_as_prompt(filepath, full_generation):
    """Save a full generation to an ``.npz`` file for reuse as a history prompt.

    Args:
        filepath: destination path (``str`` or ``os.PathLike``) whose name
            ends in ``.npz``
        full_generation: dict holding at least the ``semantic_prompt``,
            ``coarse_prompt`` and ``fine_prompt`` entries; every entry is
            written to the archive under its key

    Raises:
        AssertionError: if the path does not end in ``.npz``, if
            ``full_generation`` is not a dict, or if a required key is
            missing
    """
    import os  # local import so this block does not depend on the file header

    required_keys = ("semantic_prompt", "coarse_prompt", "fine_prompt")

    # Explicit raises instead of `assert` statements: asserts are stripped
    # under `python -O`, which would let bad input reach np.savez unchecked.
    # AssertionError is kept so existing callers see the same exception type.
    if not os.fspath(filepath).endswith(".npz"):
        raise AssertionError(
            f"filepath must end with '.npz', got {filepath!r}"
        )
    if not isinstance(full_generation, dict):
        raise AssertionError(
            "full_generation must be a dict, got "
            f"{type(full_generation).__name__}"
        )
    missing = [key for key in required_keys if key not in full_generation]
    if missing:
        raise AssertionError(
            "full_generation is missing required key(s): "
            + ", ".join(missing)
        )

    np.savez(filepath, **full_generation)
|
|
|
|
|
def generate_audio(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Produce an audio array from text in one call.

    Chains `text_to_semantic` and `semantic_to_waveform`, sharing the same
    ``history_prompt`` and ``silent`` setting between both steps.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: temperature passed to `text_to_semantic`
            (1.0 more diverse, 0.0 more conservative)
        waveform_temp: temperature passed to `semantic_to_waveform`
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt

    Returns:
        numpy audio array at sample frequency 24khz, or a
        ``(full_generation, audio)`` tuple when ``output_full`` is true
    """
    semantic = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
        silent=silent,
    )
    result = semantic_to_waveform(
        semantic,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=output_full,
    )

    if not output_full:
        # Plain mode: the waveform step already returned just the audio.
        return result

    # Full mode: the waveform step returned a (generation dict, audio) pair.
    generation, audio = result
    return generation, audio
|
|