|
import numpy as np |
|
from fastapi import HTTPException |
|
|
|
from modules.api.impl.handler.AudioHandler import AudioHandler |
|
from modules.api.impl.model.audio_model import AdjustConfig |
|
from modules.api.impl.model.chattts_model import InferConfig |
|
from modules.api.impl.model.enhancer_model import EnhancerConfig |
|
from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full |
|
from modules.normalization import text_normalize |
|
from modules.ssml_parser.SSMLParser import create_ssml_parser |
|
from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments |
|
from modules.utils import audio |
|
|
|
|
|
class SSMLHandler(AudioHandler):
    """Audio handler that renders an SSML document to audio.

    The SSML text is parsed into segments, each segment's text is normalized,
    the segments are synthesized and combined into a single clip, and the clip
    is optionally enhanced before the prosody adjustments are applied.
    """

    def __init__(
        self,
        ssml_content: str,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ) -> None:
        """Store the SSML text and the configs consumed by ``enqueue``.

        Args:
            ssml_content: SSML markup to synthesize.
            infer_config: Supplies ``batch_size``, ``eos`` and
                ``spliter_threshold`` for segment synthesis.
            adjust_config: Supplies ``speed_rate``, ``pitch`` and
                ``volume_gain_db`` for the final prosody adjustment.
            enhancer_config: Supplies ``enabled`` plus the ``nfe``, ``solver``,
                ``lambd`` and ``tau`` enhancer parameters.

        Raises:
            AssertionError: If an argument is not of the expected type.
        """
        # NOTE: ``assert`` statements are skipped when Python runs with ``-O``,
        # so these checks are a development-time safeguard only.
        assert isinstance(ssml_content, str), "ssml_content must be a string."
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config must be an InferConfig object."
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config must be an AdjustConfig object."
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config must be an EnhancerConfig object."

        self.ssml_content = ssml_content
        self.infer_config = infer_config
        # The attribute name is misspelled ("adjest"); it is kept unchanged
        # because code outside this class may read it under this name.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self) -> None:
        """Validation hook called at the end of ``__init__``; currently a no-op."""

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the stored SSML and return the resulting audio.

        Returns:
            A ``(audio_data, sample_rate)`` tuple.

        Raises:
            HTTPException: With status 422 when parsing yields no segments.
        """
        infer_config = self.infer_config
        adjust_config = self.adjest_config
        enhancer_config = self.enhancer_config

        segments = create_ssml_parser().parse(self.ssml_content)

        # Reject an empty parse result up front, before any per-segment work.
        if not segments:
            raise HTTPException(
                status_code=422, detail="The SSML text is empty or parsing failed."
            )

        for seg in segments:
            seg["text"] = text_normalize(seg["text"], is_end=True)

        synthesizer = SynthesizeSegments(
            batch_size=infer_config.batch_size,
            eos=infer_config.eos,
            spliter_thr=infer_config.spliter_threshold,
        )
        audio_segments = synthesizer.synthesize_segments(segments)
        combined_audio = combine_audio_segments(audio_segments)

        # pydub_to_np returns the sample rate first, then the samples.
        sample_rate, audio_data = audio.pydub_to_np(combined_audio)

        # Enhancement runs before the prosody adjustment and may change the
        # sample rate, so the rate it returns is the one used from here on.
        if enhancer_config.enabled:
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        audio_data = audio.apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        return audio_data, sample_rate
|
|