chattts

Running

File size: 5,369 Bytes

d2b7e94
01e655b
d2b7e94
 
01e655b
 
 
d2b7e94
 
d5d0921
 
 
 
c5458aa
d2b7e94
c5458aa
01e655b
 
 
 
 
d5d0921
02e90e4
c5458aa
d5d0921
c5458aa
d5d0921
 
 
01e655b
02e90e4
01e655b
 
 
1df74c6
 
01e655b
d5d0921
 
 
01e655b
 
 
 
 
 
c5458aa
 
 
 
1df74c6
d5d0921
 
c5458aa
d5d0921
 
 
 
 
c5458aa
 
 
 
 
 
 
 
 
01e655b
c5458aa
 
 
 
01e655b
d5d0921
01e655b
d5d0921
 
 
01e655b
d5d0921
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01e655b
d5d0921
01e655b
d5d0921
 
 
 
01e655b
 
 
 
 
ebc4336
 
 
 
 
01e655b
 
f34bda5
 
 
 
 
 
c5458aa
f34bda5
 
 
 
 
 
 
 
 
 
 
c5458aa
f34bda5
 
 
 
02e90e4
 
 
 
 
 
 
 
 
 
 
 
 
f34bda5
 
 
c5458aa
 
f34bda5
 
 
 
 
 
 
 
 
 
 
c5458aa

from typing import List, Optional

from fastapi import Body, File, Form, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from numpy import clip
from pydantic import BaseModel, Field

from modules.api import utils as api_utils
from modules.api.Api import APIManager
from modules.api.impl.handler.TTSHandler import TTSHandler
from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.data import styles_mgr
from modules.speaker import Speaker, speaker_mgr


class AudioSpeechRequest(BaseModel):
    """JSON request body accepted by the ``/v1/audio/speech`` endpoint.

    The first group of fields follows the OpenAI text-to-speech request
    shape; the remaining ones are extensions specific to this service.
    """

    # Text to synthesize.
    input: str
    # Read by the endpoint but not used to select anything there.
    model: str = "chattts-4w"
    # Speaker name; the endpoint resolves it through ``speaker_mgr``.
    voice: str = "female2"
    # Use the enum member (not the bare string "mp3") so the field holds an
    # ``AudioFormat`` even when the client omits it.
    response_format: AudioFormat = AudioFormat.mp3
    speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
    # Forwarded to ``InferConfig`` by the endpoint.
    seed: int = 42

    # Sampling parameters, forwarded to ``ChatTTSConfig`` by the endpoint.
    temperature: float = 0.3
    top_k: int = 20
    top_p: float = 0.7

    # Style name; an empty string skips the style lookup in the endpoint.
    style: str = ""
    batch_size: int = Field(1, ge=1, le=20, description="Batch size")
    spliter_threshold: float = Field(
        100, ge=10, le=1024, description="Threshold for sentence spliter"
    )
    # End-of-sentence marker.
    eos: str = "[uv_break]"

    # Either flag enables the enhancer in the endpoint; ``denoise`` also
    # selects a different ``lambd`` value there.
    enhance: bool = False
    denoise: bool = False


async def openai_speech_api(
    request: AudioSpeechRequest = Body(
        ..., description="JSON body with model, input text, and voice"
    )
):
    """Synthesize speech for an OpenAI-style ``/v1/audio/speech`` request.

    Args:
        request: Parsed JSON body describing the text, voice, output format
            and synthesis options.

    Returns:
        A ``StreamingResponse`` wrapping the buffer returned by
        ``TTSHandler.enqueue_to_buffer``. The media type is ``audio/mpeg``
        for mp3 and ``audio/<format value>`` otherwise.

    Raises:
        HTTPException: 400 when the input text is empty, the voice cannot be
            resolved to a ``Speaker``, or the style lookup fails; 500 when
            building or running the TTS handler raises any other exception.
    """
    input_text = request.input
    voice = request.voice
    style = request.style

    response_format = request.response_format
    # The field may still carry a plain string (e.g. an unvalidated default);
    # coerce it so ``.value`` and the enum comparison below are safe.
    if not isinstance(response_format, AudioFormat) and isinstance(
        response_format, str
    ):
        response_format = AudioFormat(response_format)

    # Defensive clamp; the request model already declares the same bounds.
    speed = clip(request.speed, 0.1, 10)

    if not input_text:
        raise HTTPException(status_code=400, detail="Input text is required.")
    if speaker_mgr.get_speaker(voice) is None:
        raise HTTPException(status_code=400, detail="Invalid voice.")
    if style:
        try:
            styles_mgr.find_item_by_name(style)
        except Exception as exc:
            # Catch ``Exception`` rather than everything so that
            # KeyboardInterrupt/SystemExit are not turned into a 400.
            raise HTTPException(status_code=400, detail="Invalid style.") from exc

    ctx_params = api_utils.calc_spk_style(spk=voice, style=style)

    speaker = ctx_params.get("spk")
    if not isinstance(speaker, Speaker):
        raise HTTPException(status_code=400, detail="Invalid voice.")

    tts_config = ChatTTSConfig(
        style=style,
        temperature=request.temperature,
        top_k=request.top_k,
        top_p=request.top_p,
    )
    infer_config = InferConfig(
        batch_size=request.batch_size,
        spliter_threshold=request.spliter_threshold,
        eos=request.eos,
        seed=request.seed,
    )
    adjust_config = AdjustConfig(speaking_rate=speed)
    enhancer_config = EnhancerConfig(
        # Either flag turns the enhancer on; denoise picks the larger lambd.
        enabled=request.enhance or request.denoise,
        lambd=0.9 if request.denoise else 0.1,
    )
    try:
        handler = TTSHandler(
            text_content=input_text,
            spk=speaker,
            tts_config=tts_config,
            infer_config=infer_config,
            adjust_config=adjust_config,
            enhancer_config=enhancer_config,
        )

        buffer = handler.enqueue_to_buffer(response_format)

        if response_format == AudioFormat.mp3:
            # mp3 has a registered MIME type that differs from its extension.
            mime_type = "audio/mpeg"
        else:
            mime_type = f"audio/{response_format.value}"
        return StreamingResponse(buffer, media_type=mime_type)

    except Exception as exc:
        import logging

        logging.exception(exc)

        # Pass through HTTP errors raised deliberately further down the stack;
        # anything else becomes a 500 with the original error chained.
        if isinstance(exc, HTTPException):
            raise
        raise HTTPException(status_code=500, detail=str(exc)) from exc


class TranscribeSegment(BaseModel):
    """A single segment entry inside a verbose transcription response.

    NOTE(review): the field names look like the segment objects of OpenAI's
    ``verbose_json`` transcription format; the exact semantics and units are
    not established anywhere in this file -- confirm against that spec.
    """

    # Identification / position of the segment.
    id: int
    seek: float
    start: float
    end: float

    # Transcribed content.
    text: str
    tokens: List[int]

    # Decoding statistics reported for the segment.
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float


class TranscriptionsVerboseResponse(BaseModel):
    """Response schema declared for the ``/v1/audio/transcriptions`` route."""

    # Top-level metadata about the transcription.
    task: str
    language: str
    duration: float

    # Full text plus its per-segment breakdown.
    text: str
    segments: List[TranscribeSegment]


def setup(app: APIManager):
    """Register the audio routes of this module on ``app``.

    Two POST routes are registered, in this order:

    * ``/v1/audio/speech`` -> ``openai_speech_api``
    * ``/v1/audio/transcriptions`` -> a placeholder handler that is not
      implemented yet.

    Args:
        app: Object whose ``post(path, **options)`` returns a callable that
            registers the handler passed to it.
    """
    # The description is user-facing runtime text and is kept verbatim.
    register_speech = app.post(
        "/v1/audio/speech",
        description="""
openai api document: 
[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)

以下属性为本系统自定义属性，不在openai文档中：
- batch_size: 是否开启batch合成，小于等于1表示不使用batch （不推荐）
- spliter_threshold: 开启batch合成时，句子分割的阈值
- style: 风格

> model 可填任意值
        """,
    )
    register_speech(openai_speech_api)

    register_transcriptions = app.post(
        "/v1/audio/transcriptions",
        response_model=TranscriptionsVerboseResponse,
        description="Transcribes audio into the input language.",
    )

    # Placeholder handler: accepts the multipart form fields declared below
    # but ignores all of them for now.
    async def transcribe(
        file: UploadFile = File(...),
        model: str = Form(...),
        language: Optional[str] = Form(None),
        prompt: Optional[str] = Form(None),
        response_format: str = Form("json"),
        temperature: float = Form(0),
        timestamp_granularities: List[str] = Form(["segment"]),
    ):
        # TODO: Implement transcribe
        return api_utils.success_response("not implemented yet")

    register_transcriptions(transcribe)