Spaces:
Running
Running
File size: 5,369 Bytes
d2b7e94 01e655b d2b7e94 01e655b d2b7e94 d5d0921 c5458aa d2b7e94 c5458aa 01e655b d5d0921 02e90e4 c5458aa d5d0921 c5458aa d5d0921 01e655b 02e90e4 01e655b 1df74c6 01e655b d5d0921 01e655b c5458aa 1df74c6 d5d0921 c5458aa d5d0921 c5458aa 01e655b c5458aa 01e655b d5d0921 01e655b d5d0921 01e655b d5d0921 01e655b d5d0921 01e655b d5d0921 01e655b ebc4336 01e655b f34bda5 c5458aa f34bda5 c5458aa f34bda5 02e90e4 f34bda5 c5458aa f34bda5 c5458aa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
from typing import List, Optional
from fastapi import Body, File, Form, HTTPException, UploadFile
from fastapi.responses import StreamingResponse
from numpy import clip
from pydantic import BaseModel, Field
from modules.api import utils as api_utils
from modules.api.Api import APIManager
from modules.api.impl.handler.TTSHandler import TTSHandler
from modules.api.impl.model.audio_model import AdjustConfig, AudioFormat
from modules.api.impl.model.chattts_model import ChatTTSConfig, InferConfig
from modules.api.impl.model.enhancer_model import EnhancerConfig
from modules.data import styles_mgr
from modules.speaker import Speaker, speaker_mgr
class AudioSpeechRequest(BaseModel):
    # JSON request body for the OpenAI-style `/v1/audio/speech` endpoint.
    # `input`, `model`, `voice`, `response_format` and `speed` follow the
    # OpenAI request shape; the remaining fields are extensions consumed by
    # `openai_speech_api` in this file.

    input: str  # text to synthesize
    # Accepted for OpenAI compatibility; the handler in this file does not
    # use the value.
    model: str = "chattts-4w"
    # Speaker name, resolved through `speaker_mgr.get_speaker`.
    voice: str = "female2"
    # Use the enum member (not the raw string "mp3") as the default: pydantic
    # does not validate defaults unless configured to, so a string default
    # would be stored as a plain `str` whenever the client omits the field.
    response_format: AudioFormat = AudioFormat.mp3
    speed: float = Field(1, ge=0.1, le=10, description="Speed of the audio")
    # Forwarded to `InferConfig(seed=...)`.
    seed: int = 42
    # Sampling parameters, forwarded to `ChatTTSConfig`.
    temperature: float = 0.3
    top_k: int = 20
    top_p: float = 0.7
    # Optional style name; an empty string skips the style lookup.
    style: str = ""
    batch_size: int = Field(1, ge=1, le=20, description="Batch size")
    spliter_threshold: float = Field(
        100, ge=10, le=1024, description="Threshold for sentence spliter"
    )
    # end of sentence
    eos: str = "[uv_break]"
    # Either flag enables the enhancer; `denoise` additionally selects the
    # stronger `lambd` value in the handler.
    enhance: bool = False
    denoise: bool = False
async def openai_speech_api(
    request: AudioSpeechRequest = Body(
        ..., description="JSON body with model, input text, and voice"
    )
):
    """Synthesize speech for an OpenAI-style `/v1/audio/speech` request.

    Args:
        request: Parsed request body. `request.model` is accepted but not
            used by this handler.

    Returns:
        A `StreamingResponse` wrapping the buffer returned by
        `TTSHandler.enqueue_to_buffer`. The media type is `audio/mpeg` for
        mp3 and `audio/<format value>` for every other format.

    Raises:
        HTTPException: 400 if the input text is empty, the voice cannot be
            resolved to a `Speaker`, or the style lookup fails; 500 for any
            other error raised while building or running the TTS handler.
    """
    input_text = request.input
    voice = request.voice
    style = request.style

    response_format = request.response_format
    # The field may still hold a plain string (e.g. an unvalidated default);
    # normalise it so `.value` and the enum comparison below are safe.
    if not isinstance(response_format, AudioFormat) and isinstance(
        response_format, str
    ):
        response_format = AudioFormat(response_format)

    # Defensive re-clamp; the request model already constrains the range.
    speed = clip(request.speed, 0.1, 10)

    if not input_text:
        raise HTTPException(status_code=400, detail="Input text is required.")
    if speaker_mgr.get_speaker(voice) is None:
        raise HTTPException(status_code=400, detail="Invalid voice.")

    if style:
        try:
            styles_mgr.find_item_by_name(style)
        except Exception as err:
            # `except Exception` (not a bare `except`) so that task
            # cancellation and interpreter-exit signals are not reported to
            # the client as an invalid style.
            raise HTTPException(status_code=400, detail="Invalid style.") from err

    ctx_params = api_utils.calc_spk_style(spk=voice, style=style)
    speaker = ctx_params.get("spk")
    if not isinstance(speaker, Speaker):
        raise HTTPException(status_code=400, detail="Invalid voice.")

    tts_config = ChatTTSConfig(
        style=style,
        temperature=request.temperature,
        top_k=request.top_k,
        top_p=request.top_p,
    )
    infer_config = InferConfig(
        batch_size=request.batch_size,
        spliter_threshold=request.spliter_threshold,
        eos=request.eos,
        seed=request.seed,
    )
    adjust_config = AdjustConfig(speaking_rate=speed)
    enhancer_config = EnhancerConfig(
        # Either flag turns the enhancer on; denoise picks the larger lambd.
        enabled=request.enhance or request.denoise,
        lambd=0.9 if request.denoise else 0.1,
    )

    try:
        handler = TTSHandler(
            text_content=input_text,
            spk=speaker,
            tts_config=tts_config,
            infer_config=infer_config,
            adjust_config=adjust_config,
            enhancer_config=enhancer_config,
        )
        buffer = handler.enqueue_to_buffer(response_format)

        mime_type = f"audio/{response_format.value}"
        if response_format == AudioFormat.mp3:
            mime_type = "audio/mpeg"
        return StreamingResponse(buffer, media_type=mime_type)
    except Exception as e:
        import logging

        logging.exception(e)
        if isinstance(e, HTTPException):
            # Already a well-formed API error: propagate it unchanged,
            # keeping the original traceback.
            raise
        raise HTTPException(status_code=500, detail=str(e)) from e
class TranscribeSegment(BaseModel):
    # One segment entry of a verbose transcription result; used as the
    # element type of `TranscriptionsVerboseResponse.segments`.
    # NOTE(review): the field names look like the segment object of OpenAI's
    # `verbose_json` transcription format — confirm units and semantics
    # against that API reference before relying on them.
    id: int
    seek: float
    start: float
    end: float
    text: str
    tokens: list[int]
    temperature: float
    avg_logprob: float
    compression_ratio: float
    no_speech_prob: float
class TranscriptionsVerboseResponse(BaseModel):
    # Response schema declared as `response_model` for the
    # `/v1/audio/transcriptions` route registered in `setup`.
    # NOTE(review): presumably mirrors OpenAI's `verbose_json` transcription
    # response — confirm against the API reference.
    task: str
    language: str
    duration: float
    text: str
    # Per-segment details of the transcription.
    segments: list[TranscribeSegment]
def setup(app: APIManager):
    """Register the OpenAI-compatible audio routes on ``app``.

    Adds ``POST /v1/audio/speech`` (served by ``openai_speech_api``) and
    ``POST /v1/audio/transcriptions`` (currently a stub).
    """
    # User-facing route description. In English: it links the OpenAI
    # text-to-speech guide and lists the attributes specific to this system
    # (batch_size: batch synthesis, values <= 1 disable it, not recommended;
    # spliter_threshold: sentence-split threshold used when batching;
    # style), and notes that `model` accepts any value.
    speech_description = """
openai api document:
[https://platform.openai.com/docs/guides/text-to-speech](https://platform.openai.com/docs/guides/text-to-speech)
以下属性为本系统自定义属性,不在openai文档中:
- batch_size: 是否开启batch合成,小于等于1表示不使用batch (不推荐)
- spliter_threshold: 开启batch合成时,句子分割的阈值
- style: 风格
> model 可填任意值
"""
    register_speech = app.post(
        "/v1/audio/speech",
        description=speech_description,
    )
    register_speech(openai_speech_api)

    register_transcriptions = app.post(
        "/v1/audio/transcriptions",
        response_model=TranscriptionsVerboseResponse,
        description="Transcribes audio into the input language.",
    )

    async def transcribe(
        file: UploadFile = File(...),
        model: str = Form(...),
        language: Optional[str] = Form(None),
        prompt: Optional[str] = Form(None),
        response_format: str = Form("json"),
        temperature: float = Form(0),
        timestamp_granularities: List[str] = Form(["segment"]),
    ):
        # TODO: Implement transcribe
        # NOTE(review): none of the form parameters are used yet, and the
        # placeholder payload presumably does not match the declared
        # `response_model` — verify what `api_utils.success_response`
        # returns before exposing this route.
        return api_utils.success_response("not implemented yet")

    register_transcriptions(transcribe)
|