zhengr committed
Commit c02bdcd (1 parent: 93e634f)
This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. ChatTTS/__init__.py +1 -0
  2. ChatTTS/config/__init__.py +1 -0
  3. ChatTTS/config/config.py +134 -0
  4. ChatTTS/core.py +669 -0
  5. ChatTTS/model/__init__.py +6 -0
  6. ChatTTS/model/cuda/__init__.py +1 -0
  7. ChatTTS/model/cuda/patch.py +18 -0
  8. ChatTTS/model/cuda/te_llama.py +192 -0
  9. ChatTTS/model/dvae.py +296 -0
  10. ChatTTS/model/embed.py +81 -0
  11. ChatTTS/model/gpt.py +613 -0
  12. ChatTTS/model/processors.py +58 -0
  13. ChatTTS/model/speaker.py +154 -0
  14. ChatTTS/model/tokenizer.py +138 -0
  15. ChatTTS/model/velocity/__init__.py +2 -0
  16. ChatTTS/model/velocity/block_manager.py +296 -0
  17. ChatTTS/model/velocity/configs.py +865 -0
  18. ChatTTS/model/velocity/llama.py +393 -0
  19. ChatTTS/model/velocity/llm.py +213 -0
  20. ChatTTS/model/velocity/llm_engine.py +833 -0
  21. ChatTTS/model/velocity/model_loader.py +69 -0
  22. ChatTTS/model/velocity/model_runner.py +817 -0
  23. ChatTTS/model/velocity/output.py +144 -0
  24. ChatTTS/model/velocity/sampler.py +120 -0
  25. ChatTTS/model/velocity/sampling_params.py +296 -0
  26. ChatTTS/model/velocity/scheduler.py +426 -0
  27. ChatTTS/model/velocity/sequence.py +450 -0
  28. ChatTTS/model/velocity/worker.py +251 -0
  29. ChatTTS/norm.py +253 -0
  30. ChatTTS/res/__init__.py +0 -0
  31. ChatTTS/res/homophones_map.json +0 -0
  32. ChatTTS/res/sha256_map.json +13 -0
  33. ChatTTS/utils/__init__.py +4 -0
  34. ChatTTS/utils/dl.py +220 -0
  35. ChatTTS/utils/gpu.py +40 -0
  36. ChatTTS/utils/io.py +44 -0
  37. ChatTTS/utils/log.py +16 -0
  38. Dockerfile +13 -0
  39. LICENSE +661 -0
  40. docs/cn/README.md +314 -0
  41. docs/es/README.md +255 -0
  42. docs/fr/README.md +283 -0
  43. docs/jp/README.md +134 -0
  44. docs/ru/README.md +136 -0
  45. examples/__init__.py +0 -0
  46. examples/api/README.md +23 -0
  47. examples/api/client.py +76 -0
  48. examples/api/main.py +107 -0
  49. examples/api/requirements.txt +2 -0
  50. examples/cmd/run.py +151 -0
ChatTTS/__init__.py ADDED
@@ -0,0 +1 @@
+from .core import Chat
ChatTTS/config/__init__.py ADDED
@@ -0,0 +1 @@
+from .config import Config
ChatTTS/config/config.py ADDED
@@ -0,0 +1,134 @@
+from dataclasses import dataclass
+
+
+@dataclass(repr=False, eq=False)
+class Path:
+    vocos_ckpt_path: str = "asset/Vocos.pt"
+    dvae_ckpt_path: str = "asset/DVAE_full.pt"
+    gpt_ckpt_path: str = "asset/gpt"
+    decoder_ckpt_path: str = "asset/Decoder.pt"
+    tokenizer_path: str = "asset/tokenizer"
+    embed_path: str = "asset/Embed.safetensors"
+
+
+@dataclass(repr=False, eq=False)
+class Decoder:
+    idim: int = 384
+    odim: int = 384
+    hidden: int = 512
+    n_layer: int = 12
+    bn_dim: int = 128
+
+
+@dataclass(repr=False, eq=False)
+class VQ:
+    dim: int = 1024
+    levels: tuple = (5, 5, 5, 5)
+    G: int = 2
+    R: int = 2
+
+
+@dataclass(repr=False, eq=False)
+class DVAE:
+    encoder: Decoder = Decoder(
+        idim=512,
+        odim=1024,
+        hidden=256,
+        n_layer=12,
+        bn_dim=128,
+    )
+    decoder: Decoder = Decoder(
+        idim=512,
+        odim=512,
+        hidden=256,
+        n_layer=12,
+        bn_dim=128,
+    )
+    vq: VQ = VQ()
+
+
+@dataclass(repr=False, eq=False)
+class GPT:
+    hidden_size: int = 768
+    intermediate_size: int = 3072
+    num_attention_heads: int = 12
+    num_hidden_layers: int = 20
+    use_cache: bool = False
+    max_position_embeddings: int = 4096
+
+    spk_emb_dim: int = 192
+    spk_KL: bool = False
+    num_audio_tokens: int = 626
+    num_text_tokens: int = 21178
+    num_vq: int = 4
+
+
+@dataclass(repr=False, eq=False)
+class Embed:
+    hidden_size: int = 768
+    num_audio_tokens: int = 626
+    num_text_tokens: int = 21178
+    num_vq: int = 4
+
+
+@dataclass(repr=False, eq=False)
+class FeatureExtractorInitArgs:
+    sample_rate: int = 24000
+    n_fft: int = 1024
+    hop_length: int = 256
+    n_mels: int = 100
+    padding: str = "center"
+
+
+@dataclass(repr=False, eq=False)
+class FeatureExtractor:
+    class_path: str = "vocos.feature_extractors.MelSpectrogramFeatures"
+    init_args: FeatureExtractorInitArgs = FeatureExtractorInitArgs()
+
+
+@dataclass(repr=False, eq=False)
+class BackboneInitArgs:
+    input_channels: int = 100
+    dim: int = 512
+    intermediate_dim: int = 1536
+    num_layers: int = 8
+
+
+@dataclass(repr=False, eq=False)
+class Backbone:
+    class_path: str = "vocos.models.VocosBackbone"
+    init_args: BackboneInitArgs = BackboneInitArgs()
+
+
+@dataclass(repr=False, eq=False)
+class FourierHeadInitArgs:
+    dim: int = 512
+    n_fft: int = 1024
+    hop_length: int = 256
+    padding: str = "center"
+
+
+@dataclass(repr=False, eq=False)
+class FourierHead:
+    class_path: str = "vocos.heads.ISTFTHead"
+    init_args: FourierHeadInitArgs = FourierHeadInitArgs()
+
+
+@dataclass(repr=False, eq=False)
+class Vocos:
+    feature_extractor: FeatureExtractor = FeatureExtractor()
+    backbone: Backbone = Backbone()
+    head: FourierHead = FourierHead()
+
+
+@dataclass(repr=False, eq=False)
+class Config:
+    path: Path = Path()
+    decoder: Decoder = Decoder()
+    dvae: DVAE = DVAE()
+    gpt: GPT = GPT()
+    embed: Embed = Embed()
+    vocos: Vocos = Vocos()
+    spk_stat: str = (
+ "愐穤巩噅廷戇笉屈癐媄垹垧帶爲漈塀殐慄亅倴庲舴猂瑈圐狴夥圓帍戛挠腉耐劤坽喳幾战謇聀崒栄呥倸庭燡欈杁襐褄乭埗幺爃弔摁斐捔兕佖廐舏竾豃磐姓趡佄幒爚欄豄讐皳訵仩帆投謌荃蝐叄圝伆幦抂茁呄掑斃讹傮庞爣蜀橁偐祄亥兡常爂欍扉丐浔佱僈強払伅扂蛐徴憍傞巀戺欀艂琐嗴啥値彷刂權穈扒卤俔贲庛初笂卄贐枴仭亁庛剎猢扃缐趤刁偵幪舏伌煁婐潤晍位弾舙茥穁葏蠣訑企庤刊笍橁溑僔云偁庯戚伍潉膐脴僵噔廃艅匊祂唐憴壝嗙席爥欁虁谐牴帽势弿牳蜁兀蛐傄喩丿帔刔圆衁廐罤庁促帙劢伈汄樐檄勵伴弝舑欍罅虐昴劭勅帜刼朊蕁虐蓴樑伫幨扑謪剀堐稴丵伱弐舮諸赁習俔容厱幫牶謃孄糐答嗝僊帜燲笄終瀒判久僤帘爴茇千孑冄凕佳引扐蜁歁缏裄剽儺恘爋朏眿廐呄塍嘇幻爱茠詁訐剴唭俐幾戊欀硁菐贄楕偒巡爀弎屄莐睳賙凶彎刅漄區唐溴剑劋庽舽猄煃跐夔惥伾庮舎伈罁垑坄怅业怯刁朇獁嶏覔坩俳巶爜朐潁崐萄俹凛常爺笌穀聐此夡倛帡刀匉終窏舣販侽怿扉伥贿憐忓謩姆幌犊漂慆癒却甝兎帼戏欅詂浐朔仹壭帰臷弎恇菐獤帡偖帘爞伅腂皐纤囅充幓戠伥灂丐訤戱倱弋爮嬌癁恐孄侥劬忶刓國詀桒古偩嘄庬戚茝赂监燤嘑勌幦舽持呂諐棤姑再底舡笍艃瀐孴倉傔弋爔猠乁濑塄偽嘧恂舛缇襃厐窴仡刱忕別漇穁岏缴廽价庌爊謈硄讑惤倁儂庭爋伇蝂嶐莔摝傠库刞茄歃戏薤伍伯廮创笠塄熐兴勽俄帅剉最腀砐敤卝侍弆戺朒虃旐蚄梕亖幔牻朣扅贐玔堝噅帡剌圅摀崐彤流僳庙爖嬇啁渐悤堁丛幆刧挜彃悐幤刹嚟恕芁看聀摐焔向乁帖爭欁癃糒圄弙佱廜戤謍婀咐昴焍亩廦艏拼謿芐癤怹兽幸舳朇畁喐稔毝丼弈懲挀譂勑哴啁伎常舭笯晁堑俄叩剔廟爍欦絁夒伤休傑廳戌蜅潆癐彴摑勯床刽欅艁砐忄搉从廡舊猥潂唐委仱僜廼爤朄呃弐礔滵垓幩爄挂筁乐籤刕凟幵爠弉���乑吴勥伖帪舩茆婁碐幤叭乢巜艳猁桀桐啄唩俊幍舮猀艅焐螔琽亀帋爜缅噃咐斤喩予幩爛笆摀浐猴依侹幃刕園慄蛐栤澹仑座爼謉桃慐浔斕偻幛懰嬓衁愐氄悅仿应芔漄衃敐謤傁匩幹抃圉癄廐裄屵噉幍利謍聂搐蛔嚙坍怗舁圐畃膐栄刵东巆戤諾呃偑媤嗨跞忶爝眄祂朒嶔僭劉忾刐匋癄袐翴珅僷廲芄茈恈皐擄崑伄廉牍匃剃犏澤唑丄庺戃伃煀某杄偙亽帴切缌罄挐尴噙倰带舞漄橄塐糴俩僯帀般漀坂栐更両俇廱舌猁慂拐偤嶱卶应刪眉獁茐伔嘅偺帟舊漂恀栐暄喡乞庙舆匂敀潑恔劑侖延戦盽怶唯慳蝘蟃孫娎益袰玍屃痶翮笪儚裀倹椌玻翀詵筽舘惯堿某侰晈藏缮詗廦夸妎瑻瀒裔媀憞唃冶璭狻渠荑奬熹茅愺氰菣滠翦岓褌泣崲嚭欓湒聙宺爄蛅愸庍匃帆誔穮懌蓪玷澌氋抌訙屌臞廛玸听屺希疭孝凂紋新煎彃膲跱尪懁眆窴珏卓揨菸紭概囥显壌榄垫嘮嬭覤媸侵佮烒耸觌婀秋狃帹葯訤桜糨笾腢伀肶悍炂艤禖岅臺惘梷瞍友盁佨岧憳瓧嘴汬藊愌蘤嶠硴绤蜲襏括勾谂縨妥蓪澭竭萢藜纞糲煮愆瀯孯琓罂諺塿燗狟弙衯揻縷丱糅臄梱瀮杰巳猙亊符胠匃泀廏圃膂蒃籏礩岈簹缌劺燲褡孓膜拔蠿觮呋煣厌尷熜論弲牭紫寊誃紀橴賬傸箍弚窃侫簲慯烣渽祌壓媥噜夽夛諛玹疮禄冪謇媽衤盰缺繑薫兾萧嵱打滽箺嚯凣狢蠜崼覽烸簶盯籓摀苶峸懗泲涻凮愳緗剋笔懆廡瞿椏礤惐藥崍腈烄伹亯昣翬褍絋桫僨吨莌丛矄蜞娈憊苆塁蓏嚢嫼绻崱婋囱蠸篯晣芀繼索兓僖誹岯圪褰蠇唓妷胅巁渮砛傈蝷嵚冃購赁峍裋荂舾符熻岳墩寮粃凲袑彚太绲头摯繳狁俥籌冝諝註坎幫擤詒宒凕賐唶梎噔弼課屿覍囨焬櫱撪蝮蝬簸懰櫫涺嵍睻屪翔峞慘滟熲昱军烊舿尦舄糖奁溏凂彆蝲糴禍困皻灏牋睒诙嶱臀开蓈眎腼丢纻廏憤嫖暭袭崲肸螛妒榗紉谨窮袃瑠聍绊腆亿冲葐喋縔詖岑兾给堸赏旻桀蛨媆訂峦紷敯囬偐筨岸焸拭笵殒哜墒萍屓娓諙械臮望摰芑寭准僞谹氍旋憢菮屃划欣瘫谎蘻哐繁籥禦僿誵皯墓燀縿笞熦绗稹榎矻綞蓓帡戓沺区才畃洊詪糐裶盰窶耎偌劂誐庩惝滜沺哮呃煐譠崄槀猄肼蔐擋湌蠺篃恥諌瞦宍堫挪裕崑慩狲悠煋仛愞砈粵八棁害楐妋萔貨尵奂苰怫誎傫岆蕯屇脉夈仆茎刓繸芺壸碗曛汁戭炻獻凉媁兎狜爴怰賃纎袏娷禃蓥膹薪渻罸窿粫凾褄舺窮墫干苊繁冏僮訸夯绛蓪虛羽慲烏憷趎睊蠰莍塞成廎盁欏喓蜮譤崆楁囘矇薭伣艘虝帴奮苢渶虎暣翐蝃尾稈糶瀴罐嵚氮葯笫慐棌悶炯竻爅们媡姢嫺窷刮歫劈裩屬椕賑蜹薊刲義哯尗褦瓀稾礋揣窼舫尋姁椄侸嗫珺修纘媃腽蛛稹梭呛瀈蘟縀礉論夵售主梮蠉娅娭裀誼嶭観枳倊簈褃擞綿催瞃溶苊笛襹櫲盅六囫獩佃粨慯瓢眸旱荃婨蔞岋祗墼焻网牻琖詆峋秉胳媴袭澓賢経稟壩胫碯偏囫嶎纆窈槊賐撹璬莃缘誾宭愊眗喷监劋萘訯總槿棭戾墮犄恌縈簍樥蛔杁袭嫛憫倆篏墵賈羯茎觳蒜致娢慄勒覸蘍曲栂葭宆妋皽缽免盳猼蔂糥觧烳檸佯憓煶蔐筼种繷琲膌塄剰讎対腕棥渽忲俛浪譬秛惛壒嘸淫冻曄睻砃奫貯庴爅粓脮脡娎妖峵蘲討惋泊蠀㴆"
+    )
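
Taken together, these dataclasses are plain default-value containers; Chat in ChatTTS/core.py below builds a single Config() and hands each sub-config to the matching module via dataclasses.asdict. A minimal sketch of how the defaults are consumed (the checkpoint-path override at the end is hypothetical, not something this commit does):

from dataclasses import asdict

from ChatTTS.config import Config

cfg = Config()
print(cfg.gpt.num_audio_tokens)     # 626 audio codes per codebook head
print(asdict(cfg.vocos.backbone))   # spec consumed by vocos.pretrained.instantiate_class in core.py

# Hypothetical override: point the GPT checkpoint at a custom asset directory.
cfg.path.gpt_ckpt_path = "my_assets/gpt"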
ChatTTS/core.py ADDED
@@ -0,0 +1,669 @@
1
+ import os
2
+ import logging
3
+ import tempfile
4
+ from dataclasses import dataclass, asdict
5
+ from typing import Literal, Optional, List, Tuple, Dict, Union
6
+ from json import load
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import torch
11
+ from vocos import Vocos
12
+ from vocos.pretrained import instantiate_class
13
+ from huggingface_hub import snapshot_download
14
+
15
+ from .config import Config
16
+ from .model import DVAE, Embed, GPT, gen_logits, Tokenizer, Speaker
17
+ from .utils import (
18
+ check_all_assets,
19
+ download_all_assets,
20
+ select_device,
21
+ get_latest_modified_file,
22
+ del_all,
23
+ )
24
+ from .utils import logger as utils_logger
25
+
26
+ from .norm import Normalizer
27
+
28
+
29
+ class Chat:
30
+ def __init__(self, logger=logging.getLogger(__name__)):
31
+ self.logger = logger
32
+ utils_logger.set_logger(logger)
33
+
34
+ self.config = Config()
35
+
36
+ self.normalizer = Normalizer(
37
+ os.path.join(os.path.dirname(__file__), "res", "homophones_map.json"),
38
+ logger,
39
+ )
40
+ with open(
41
+ os.path.join(os.path.dirname(__file__), "res", "sha256_map.json")
42
+ ) as f:
43
+ self.sha256_map: Dict[str, str] = load(f)
44
+
45
+ self.context = GPT.Context()
46
+
47
+ def has_loaded(self, use_decoder=False):
48
+ not_finish = False
49
+ check_list = ["vocos", "gpt", "tokenizer", "embed"]
50
+
51
+ if use_decoder:
52
+ check_list.append("decoder")
53
+ else:
54
+ check_list.append("dvae")
55
+
56
+ for module in check_list:
57
+ if not hasattr(self, module):
58
+ self.logger.warning(f"{module} not initialized.")
59
+ not_finish = True
60
+
61
+ return not not_finish
62
+
63
+ def download_models(
64
+ self,
65
+ source: Literal["huggingface", "local", "custom"] = "local",
66
+ force_redownload=False,
67
+ custom_path: Optional[torch.serialization.FILE_LIKE] = None,
68
+ ) -> Optional[str]:
69
+ if source == "local":
70
+ download_path = os.getcwd()
71
+ if (
72
+ not check_all_assets(Path(download_path), self.sha256_map, update=True)
73
+ or force_redownload
74
+ ):
75
+ with tempfile.TemporaryDirectory() as tmp:
76
+ download_all_assets(tmpdir=tmp)
77
+ if not check_all_assets(
78
+ Path(download_path), self.sha256_map, update=False
79
+ ):
80
+ self.logger.error(
81
+ "download to local path %s failed.", download_path
82
+ )
83
+ return None
84
+ elif source == "huggingface":
85
+ hf_home = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
86
+ try:
87
+ download_path = get_latest_modified_file(
88
+ os.path.join(hf_home, "hub/models--2Noise--ChatTTS/snapshots")
89
+ )
90
+ except:
91
+ download_path = None
92
+ if download_path is None or force_redownload:
93
+ self.logger.log(
94
+ logging.INFO,
95
+ f"download from HF: https://huggingface.co/2Noise/ChatTTS",
96
+ )
97
+ try:
98
+ download_path = snapshot_download(
99
+ repo_id="2Noise/ChatTTS",
100
+ allow_patterns=["*.pt", "*.yaml", "*.json", "*.safetensors"],
101
+ )
102
+ except:
103
+ download_path = None
104
+ else:
105
+ self.logger.log(
106
+ logging.INFO, f"load latest snapshot from cache: {download_path}"
107
+ )
108
+ if download_path is None:
109
+ self.logger.error("download from huggingface failed.")
110
+ return None
111
+ elif source == "custom":
112
+ self.logger.log(logging.INFO, f"try to load from local: {custom_path}")
113
+ if not check_all_assets(Path(custom_path), self.sha256_map, update=False):
114
+ self.logger.error("check models in custom path %s failed.", custom_path)
115
+ return None
116
+ download_path = custom_path
117
+
118
+ return download_path
119
+
120
+ def load(
121
+ self,
122
+ source: Literal["huggingface", "local", "custom"] = "local",
123
+ force_redownload=False,
124
+ compile: bool = False,
125
+ custom_path: Optional[torch.serialization.FILE_LIKE] = None,
126
+ device: Optional[torch.device] = None,
127
+ coef: Optional[torch.Tensor] = None,
128
+ use_flash_attn=False,
129
+ use_vllm=False,
130
+ experimental: bool = False,
131
+ ) -> bool:
132
+ download_path = self.download_models(source, force_redownload, custom_path)
133
+ if download_path is None:
134
+ return False
135
+ return self._load(
136
+ device=device,
137
+ compile=compile,
138
+ coef=coef,
139
+ use_flash_attn=use_flash_attn,
140
+ use_vllm=use_vllm,
141
+ experimental=experimental,
142
+ **{
143
+ k: os.path.join(download_path, v)
144
+ for k, v in asdict(self.config.path).items()
145
+ },
146
+ )
147
+
148
+ def unload(self):
149
+ logger = self.logger
150
+ self.normalizer.destroy()
151
+ del self.normalizer
152
+ del self.sha256_map
153
+ del_list = ["vocos", "gpt", "decoder", "dvae", "tokenizer", "embed"]
154
+ for module in del_list:
155
+ if hasattr(self, module):
156
+ delattr(self, module)
157
+ self.__init__(logger)
158
+
159
+ def sample_random_speaker(self) -> str:
160
+ return self.speaker.sample_random()
161
+
162
+ def sample_audio_speaker(self, wav: Union[np.ndarray, torch.Tensor]) -> str:
163
+ return self.speaker.encode_prompt(self.dvae.sample_audio(wav))
164
+
165
+ @dataclass(repr=False, eq=False)
166
+ class RefineTextParams:
167
+ prompt: str = ""
168
+ top_P: float = 0.7
169
+ top_K: int = 20
170
+ temperature: float = 0.7
171
+ repetition_penalty: float = 1.0
172
+ max_new_token: int = 384
173
+ min_new_token: int = 0
174
+ show_tqdm: bool = True
175
+ ensure_non_empty: bool = True
176
+ manual_seed: Optional[int] = None
177
+
178
+ @dataclass(repr=False, eq=False)
179
+ class InferCodeParams(RefineTextParams):
180
+ prompt: str = "[speed_5]"
181
+ spk_emb: Optional[str] = None
182
+ spk_smp: Optional[str] = None
183
+ txt_smp: Optional[str] = None
184
+ temperature: float = 0.3
185
+ repetition_penalty: float = 1.05
186
+ max_new_token: int = 2048
187
+ stream_batch: int = 24
188
+ stream_speed: int = 12000
189
+ pass_first_n_batches: int = 2
190
+
191
+ def infer(
192
+ self,
193
+ text,
194
+ stream=False,
195
+ lang=None,
196
+ skip_refine_text=False,
197
+ refine_text_only=False,
198
+ use_decoder=True,
199
+ do_text_normalization=True,
200
+ do_homophone_replacement=True,
201
+ params_refine_text=RefineTextParams(),
202
+ params_infer_code=InferCodeParams(),
203
+ ):
204
+ self.context.set(False)
205
+ res_gen = self._infer(
206
+ text,
207
+ stream,
208
+ lang,
209
+ skip_refine_text,
210
+ refine_text_only,
211
+ use_decoder,
212
+ do_text_normalization,
213
+ do_homophone_replacement,
214
+ params_refine_text,
215
+ params_infer_code,
216
+ )
217
+ if stream:
218
+ return res_gen
219
+ else:
220
+ return next(res_gen)
221
+
222
+ def interrupt(self):
223
+ self.context.set(True)
224
+
225
+ @torch.no_grad()
226
+ def _load(
227
+ self,
228
+ vocos_ckpt_path: str = None,
229
+ dvae_ckpt_path: str = None,
230
+ gpt_ckpt_path: str = None,
231
+ embed_path: str = None,
232
+ decoder_ckpt_path: str = None,
233
+ tokenizer_path: str = None,
234
+ device: Optional[torch.device] = None,
235
+ compile: bool = False,
236
+ coef: Optional[str] = None,
237
+ use_flash_attn=False,
238
+ use_vllm=False,
239
+ experimental: bool = False,
240
+ ):
241
+ if device is None:
242
+ device = select_device(experimental=experimental)
243
+ self.logger.info("use device %s", str(device))
244
+ self.device = device
245
+ self.device_gpt = device if "mps" not in str(device) else torch.device("cpu")
246
+ self.compile = compile
247
+
248
+ feature_extractor = instantiate_class(
249
+ args=(), init=asdict(self.config.vocos.feature_extractor)
250
+ )
251
+ backbone = instantiate_class(args=(), init=asdict(self.config.vocos.backbone))
252
+ head = instantiate_class(args=(), init=asdict(self.config.vocos.head))
253
+ vocos = (
254
+ Vocos(feature_extractor=feature_extractor, backbone=backbone, head=head)
255
+ .to(
256
+ # vocos on mps will crash, use cpu fallback
257
+ "cpu"
258
+ if "mps" in str(device)
259
+ else device
260
+ )
261
+ .eval()
262
+ )
263
+ assert vocos_ckpt_path, "vocos_ckpt_path should not be None"
264
+ vocos.load_state_dict(torch.load(vocos_ckpt_path, weights_only=True, mmap=True))
265
+ self.vocos = vocos
266
+ self.logger.log(logging.INFO, "vocos loaded.")
267
+
268
+ dvae = (
269
+ DVAE(
270
+ decoder_config=asdict(self.config.dvae.decoder),
271
+ encoder_config=asdict(self.config.dvae.encoder),
272
+ vq_config=asdict(self.config.dvae.vq),
273
+ dim=self.config.dvae.decoder.idim,
274
+ coef=coef,
275
+ device=device,
276
+ )
277
+ .to(device)
278
+ .eval()
279
+ )
280
+ coef = str(dvae)
281
+ assert dvae_ckpt_path, "dvae_ckpt_path should not be None"
282
+ dvae.load_state_dict(torch.load(dvae_ckpt_path, weights_only=True, mmap=True))
283
+ self.dvae = dvae
284
+ self.logger.log(logging.INFO, "dvae loaded.")
285
+
286
+ embed = Embed(
287
+ self.config.embed.hidden_size,
288
+ self.config.embed.num_audio_tokens,
289
+ self.config.embed.num_text_tokens,
290
+ self.config.embed.num_vq,
291
+ )
292
+ embed.from_pretrained(embed_path, device=device)
293
+ self.embed = embed.to(device)
294
+ self.logger.log(logging.INFO, "embed loaded.")
295
+
296
+ gpt = GPT(
297
+ gpt_config=asdict(self.config.gpt),
298
+ embed=self.embed,
299
+ use_flash_attn=use_flash_attn,
300
+ use_vllm=use_vllm,
301
+ device=device,
302
+ device_gpt=self.device_gpt,
303
+ logger=self.logger,
304
+ ).eval()
305
+ assert gpt_ckpt_path, "gpt_ckpt_path should not be None"
306
+ gpt.from_pretrained(gpt_ckpt_path, embed_path, experimental=experimental)
307
+ gpt.prepare(compile=compile and "cuda" in str(device))
308
+ self.gpt = gpt
309
+ self.logger.log(logging.INFO, "gpt loaded.")
310
+
311
+ self.speaker = Speaker(
312
+ self.config.gpt.hidden_size, self.config.spk_stat, device
313
+ )
314
+ self.logger.log(logging.INFO, "speaker loaded.")
315
+
316
+ decoder = (
317
+ DVAE(
318
+ decoder_config=asdict(self.config.decoder),
319
+ dim=self.config.decoder.idim,
320
+ coef=coef,
321
+ device=device,
322
+ )
323
+ .to(device)
324
+ .eval()
325
+ )
326
+ coef = str(decoder)
327
+ assert decoder_ckpt_path, "decoder_ckpt_path should not be None"
328
+ decoder.load_state_dict(
329
+ torch.load(decoder_ckpt_path, weights_only=True, mmap=True)
330
+ )
331
+ self.decoder = decoder
332
+ self.logger.log(logging.INFO, "decoder loaded.")
333
+
334
+ if tokenizer_path:
335
+ self.tokenizer = Tokenizer(tokenizer_path)
336
+ self.logger.log(logging.INFO, "tokenizer loaded.")
337
+
338
+ self.coef = coef
339
+
340
+ return self.has_loaded()
341
+
342
+ def _infer(
343
+ self,
344
+ text,
345
+ stream=False,
346
+ lang=None,
347
+ skip_refine_text=False,
348
+ refine_text_only=False,
349
+ use_decoder=True,
350
+ do_text_normalization=True,
351
+ do_homophone_replacement=True,
352
+ params_refine_text=RefineTextParams(),
353
+ params_infer_code=InferCodeParams(),
354
+ ):
355
+
356
+ assert self.has_loaded(use_decoder=use_decoder)
357
+
358
+ if not isinstance(text, list):
359
+ text = [text]
360
+
361
+ text = [
362
+ self.normalizer(
363
+ t,
364
+ do_text_normalization,
365
+ do_homophone_replacement,
366
+ lang,
367
+ )
368
+ for t in text
369
+ ]
370
+
371
+ self.logger.debug("normed texts %s", str(text))
372
+
373
+ if not skip_refine_text:
374
+ refined = self._refine_text(
375
+ text,
376
+ self.device,
377
+ params_refine_text,
378
+ )
379
+ text_tokens = refined.ids
380
+ text_tokens = [i[i.less(self.tokenizer.break_0_ids)] for i in text_tokens]
381
+ text = self.tokenizer.decode(text_tokens)
382
+ refined.destroy()
383
+ if refine_text_only:
384
+ yield text
385
+ return
386
+
387
+ if stream:
388
+ length = 0
389
+ pass_batch_count = 0
390
+ for result in self._infer_code(
391
+ text,
392
+ stream,
393
+ self.device,
394
+ use_decoder,
395
+ params_infer_code,
396
+ ):
397
+ wavs = self._decode_to_wavs(
398
+ result.hiddens if use_decoder else result.ids,
399
+ use_decoder,
400
+ )
401
+ result.destroy()
402
+ if stream:
403
+ pass_batch_count += 1
404
+ if pass_batch_count <= params_infer_code.pass_first_n_batches:
405
+ continue
406
+ a = length
407
+ b = a + params_infer_code.stream_speed
408
+ if b > wavs.shape[1]:
409
+ b = wavs.shape[1]
410
+ new_wavs = wavs[:, a:b]
411
+ length = b
412
+ yield new_wavs
413
+ else:
414
+ yield wavs
415
+ if stream:
416
+ new_wavs = wavs[:, length:]
417
+ # Identify rows with non-zero elements using np.any
418
+ # keep_rows = np.any(array != 0, axis=1)
419
+ keep_cols = np.sum(new_wavs != 0, axis=0) > 0
420
+ # Filter both rows and columns using slicing
421
+ yield new_wavs[:][:, keep_cols]
422
+
423
+ @torch.inference_mode()
424
+ def _vocos_decode(self, spec: torch.Tensor) -> np.ndarray:
425
+ if "mps" in str(self.device):
426
+ return self.vocos.decode(spec.cpu()).cpu().numpy()
427
+ else:
428
+ return self.vocos.decode(spec).cpu().numpy()
429
+
430
+ @torch.inference_mode()
431
+ def _decode_to_wavs(
432
+ self,
433
+ result_list: List[torch.Tensor],
434
+ use_decoder: bool,
435
+ ):
436
+ decoder = self.decoder if use_decoder else self.dvae
437
+ max_x_len = -1
438
+ if len(result_list) == 0:
439
+ return np.array([], dtype=np.float32)
440
+ for result in result_list:
441
+ if result.size(0) > max_x_len:
442
+ max_x_len = result.size(0)
443
+ batch_result = torch.zeros(
444
+ (len(result_list), result_list[0].size(1), max_x_len),
445
+ dtype=result_list[0].dtype,
446
+ device=result_list[0].device,
447
+ )
448
+ for i in range(len(result_list)):
449
+ src = result_list[i]
450
+ batch_result[i].narrow(1, 0, src.size(0)).copy_(src.permute(1, 0))
451
+ del src
452
+ del_all(result_list)
453
+ mel_specs = decoder(batch_result)
454
+ del batch_result
455
+ wavs = self._vocos_decode(mel_specs)
456
+ del mel_specs
457
+ return wavs
458
+
459
+ @torch.no_grad()
460
+ def _infer_code(
461
+ self,
462
+ text: Tuple[List[str], str],
463
+ stream: bool,
464
+ device: torch.device,
465
+ return_hidden: bool,
466
+ params: InferCodeParams,
467
+ ):
468
+
469
+ gpt = self.gpt
470
+
471
+ if not isinstance(text, list):
472
+ text = [text]
473
+
474
+ assert len(text), "text should not be empty"
475
+
476
+ if not isinstance(params.temperature, list):
477
+ temperature = [params.temperature] * self.config.gpt.num_vq
478
+ else:
479
+ temperature = params.temperature
480
+
481
+ input_ids, attention_mask, text_mask = self.tokenizer.encode(
482
+ self.speaker.decorate_code_prompts(
483
+ text,
484
+ params.prompt,
485
+ params.txt_smp,
486
+ params.spk_emb,
487
+ ),
488
+ self.config.gpt.num_vq,
489
+ prompt=(
490
+ self.speaker.decode_prompt(params.spk_smp)
491
+ if params.spk_smp is not None
492
+ else None
493
+ ),
494
+ device=self.device_gpt,
495
+ )
496
+ start_idx = input_ids.shape[-2]
497
+
498
+ num_code = self.config.gpt.num_audio_tokens - 1
499
+
500
+ logits_warpers, logits_processors = gen_logits(
501
+ num_code=num_code,
502
+ top_P=params.top_P,
503
+ top_K=params.top_K,
504
+ repetition_penalty=params.repetition_penalty,
505
+ )
506
+
507
+ if gpt.is_vllm:
508
+ from .model.velocity import SamplingParams
509
+
510
+ sample_params = SamplingParams(
511
+ temperature=temperature,
512
+ max_new_token=params.max_new_token,
513
+ max_tokens=8192,
514
+ min_new_token=params.min_new_token,
515
+ logits_processors=(logits_processors, logits_warpers),
516
+ eos_token=num_code,
517
+ infer_text=False,
518
+ start_idx=start_idx,
519
+ )
520
+ input_ids = [i.tolist() for i in input_ids]
521
+
522
+ result = gpt.llm.generate(
523
+ None,
524
+ sample_params,
525
+ input_ids,
526
+ )
527
+
528
+ token_ids = []
529
+ hidden_states = []
530
+ for i in result:
531
+ token_ids.append(torch.tensor(i.outputs[0].token_ids))
532
+ hidden_states.append(
533
+ i.outputs[0].hidden_states.to(torch.float32).to(self.device)
534
+ )
535
+
536
+ del text_mask, input_ids
537
+
538
+ return [
539
+ GPT.GenerationOutputs(
540
+ ids=token_ids,
541
+ hiddens=hidden_states,
542
+ attentions=[],
543
+ ),
544
+ ]
545
+
546
+ emb = self.embed(input_ids, text_mask)
547
+
548
+ del text_mask
549
+
550
+ if params.spk_emb is not None:
551
+ self.speaker.apply(
552
+ emb,
553
+ params.spk_emb,
554
+ input_ids,
555
+ self.tokenizer.spk_emb_ids,
556
+ self.gpt.device_gpt,
557
+ )
558
+
559
+ result = gpt.generate(
560
+ emb,
561
+ input_ids,
562
+ temperature=torch.tensor(temperature, device=device),
563
+ eos_token=num_code,
564
+ attention_mask=attention_mask,
565
+ max_new_token=params.max_new_token,
566
+ min_new_token=params.min_new_token,
567
+ logits_processors=(*logits_processors, *logits_warpers),
568
+ infer_text=False,
569
+ return_hidden=return_hidden,
570
+ stream=stream,
571
+ show_tqdm=params.show_tqdm,
572
+ ensure_non_empty=params.ensure_non_empty,
573
+ stream_batch=params.stream_batch,
574
+ manual_seed=params.manual_seed,
575
+ context=self.context,
576
+ )
577
+
578
+ del emb, input_ids
579
+
580
+ return result
581
+
582
+ @torch.no_grad()
583
+ def _refine_text(
584
+ self,
585
+ text: str,
586
+ device: torch.device,
587
+ params: RefineTextParams,
588
+ ):
589
+
590
+ gpt = self.gpt
591
+
592
+ if not isinstance(text, list):
593
+ text = [text]
594
+
595
+ input_ids, attention_mask, text_mask = self.tokenizer.encode(
596
+ self.speaker.decorate_text_prompts(text, params.prompt),
597
+ self.config.gpt.num_vq,
598
+ device=self.device_gpt,
599
+ )
600
+
601
+ logits_warpers, logits_processors = gen_logits(
602
+ num_code=self.tokenizer.len,
603
+ top_P=params.top_P,
604
+ top_K=params.top_K,
605
+ repetition_penalty=params.repetition_penalty,
606
+ )
607
+
608
+ if gpt.is_vllm:
609
+ from .model.velocity import SamplingParams
610
+
611
+ sample_params = SamplingParams(
612
+ repetition_penalty=params.repetition_penalty,
613
+ temperature=params.temperature,
614
+ top_p=params.top_P,
615
+ top_k=params.top_K,
616
+ max_new_token=params.max_new_token,
617
+ max_tokens=8192,
618
+ min_new_token=params.min_new_token,
619
+ logits_processors=(logits_processors, logits_warpers),
620
+ eos_token=self.tokenizer.eos_token,
621
+ infer_text=True,
622
+ start_idx=input_ids.shape[-2],
623
+ )
624
+ input_ids_list = [i.tolist() for i in input_ids]
625
+ del input_ids
626
+
627
+ result = gpt.llm.generate(
628
+ None, sample_params, input_ids_list, params.show_tqdm
629
+ )
630
+ token_ids = []
631
+ hidden_states = []
632
+ for i in result:
633
+ token_ids.append(torch.tensor(i.outputs[0].token_ids))
634
+ hidden_states.append(i.outputs[0].hidden_states)
635
+
636
+ del text_mask, input_ids_list, result
637
+
638
+ return GPT.GenerationOutputs(
639
+ ids=token_ids,
640
+ hiddens=hidden_states,
641
+ attentions=[],
642
+ )
643
+
644
+ emb = self.embed(input_ids, text_mask)
645
+
646
+ del text_mask
647
+
648
+ result = next(
649
+ gpt.generate(
650
+ emb,
651
+ input_ids,
652
+ temperature=torch.tensor([params.temperature], device=device),
653
+ eos_token=self.tokenizer.eos_token,
654
+ attention_mask=attention_mask,
655
+ max_new_token=params.max_new_token,
656
+ min_new_token=params.min_new_token,
657
+ logits_processors=(*logits_processors, *logits_warpers),
658
+ infer_text=True,
659
+ stream=False,
660
+ show_tqdm=params.show_tqdm,
661
+ ensure_non_empty=params.ensure_non_empty,
662
+ manual_seed=params.manual_seed,
663
+ context=self.context,
664
+ )
665
+ )
666
+
667
+ del emb, input_ids
668
+
669
+ return result
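
The Chat class above is the public entry point re-exported by ChatTTS/__init__.py. A minimal end-to-end sketch, assuming the assets can be downloaded from the Hugging Face hub and that soundfile (not part of this commit) is available for writing the result:

import soundfile as sf

import ChatTTS

chat = ChatTTS.Chat()
if not chat.load(source="huggingface"):        # downloads checkpoints; returns False on failure
    raise RuntimeError("model load failed")

params = ChatTTS.Chat.InferCodeParams(
    spk_emb=chat.sample_random_speaker(),      # random voice, serialized as a string
)
wavs = chat.infer(
    ["Hello, this is a ChatTTS smoke test."],
    params_infer_code=params,
)
sf.write("output.wav", wavs[0], 24000)         # the DVAE/Vocos pipeline operates at 24 kHz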
ChatTTS/model/__init__.py ADDED
@@ -0,0 +1,6 @@
+from .dvae import DVAE
+from .embed import Embed
+from .gpt import GPT
+from .processors import gen_logits
+from .speaker import Speaker
+from .tokenizer import Tokenizer
ChatTTS/model/cuda/__init__.py ADDED
@@ -0,0 +1 @@
+from .te_llama import TELlamaModel
ChatTTS/model/cuda/patch.py ADDED
@@ -0,0 +1,18 @@
+import torch
+
+
+class LlamaRMSNorm(torch.nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states: torch.Tensor):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight.to(hidden_states.device) * hidden_states.to(input_dtype)
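
The patched norm above is a straight RMSNorm: it rescales by the root mean square over the hidden dimension instead of subtracting a mean, then applies a learned per-channel weight. A quick sanity check, not part of the commit, comparing the module against the formula with its default all-ones weight:

import torch

from ChatTTS.model.cuda.patch import LlamaRMSNorm

norm = LlamaRMSNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 3, 8)

# weight is initialized to ones, so the module output equals x * rsqrt(mean(x**2) + eps)
expected = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), expected, atol=1e-6)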
ChatTTS/model/cuda/te_llama.py ADDED
@@ -0,0 +1,192 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+#
+# From https://github.com/NVIDIA/TransformerEngine/blob/main/docs/examples/te_llama/te_llama.py
+#
+# Edited by fumiama.
+
+import re
+from contextlib import contextmanager
+from typing import Dict
+
+import transformer_engine as te
+from transformer_engine.pytorch.attention import RotaryPositionEmbedding
+
+import torch
+
+import transformers
+from transformers.models.llama.modeling_llama import (
+    LlamaModel,
+    LlamaConfig,
+)
+from transformers.modeling_utils import _load_state_dict_into_model
+
+from .patch import LlamaRMSNorm
+
+
+@contextmanager
+def replace_decoder(te_decoder_cls, llama_rms_norm_cls):
+    """
+    Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`.
+    """
+    original_llama_decoder_cls = (
+        transformers.models.llama.modeling_llama.LlamaDecoderLayer
+    )
+    transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls
+    original_llama_rms_norm_cls = transformers.models.llama.modeling_llama.LlamaRMSNorm
+    transformers.models.llama.modeling_llama.LlamaRMSNorm = llama_rms_norm_cls
+    try:
+        yield
+    finally:
+        transformers.models.llama.modeling_llama.LlamaDecoderLayer = (
+            original_llama_decoder_cls
+        )
+        transformers.models.llama.modeling_llama.LlamaRMSNorm = (
+            original_llama_rms_norm_cls
+        )
+
+
+class TELlamaDecoderLayer(te.pytorch.TransformerLayer):
+    """
+    Wrapper class over TE's `TransformerLayer`. This makes the wrapper very
+    similar to HF's `LlamaDecoderLayer` and easier to replace it in the code.
+
+    Args:
+        config: LlamaConfig
+        args: positional args (for compatibility with `LlamaDecoderLayer`)
+        kwargs: keyword args (for compatibility with `LlamaDecoderLayer`)
+    """
+
+    def __init__(self, config, *args, **kwargs):
+        super().__init__(
+            hidden_size=config.hidden_size,
+            ffn_hidden_size=config.intermediate_size,
+            num_attention_heads=config.num_attention_heads,
+            bias=False,
+            layernorm_epsilon=config.rms_norm_eps,
+            hidden_dropout=0,
+            attention_dropout=0,
+            fuse_qkv_params=False,
+            normalization="RMSNorm",
+            activation="swiglu",
+            attn_input_format="bshd",
+            num_gqa_groups=config.num_key_value_heads,
+        )
+        te_rope = RotaryPositionEmbedding(
+            config.hidden_size // config.num_attention_heads
+        )
+        self.te_rope_emb = te_rope(max_seq_len=config.max_position_embeddings).cuda()
+
+    def forward(self, hidden_states, *args, attention_mask, **kwargs):
+        """
+        Custom forward to make sure we only pass relevant arguments to the
+        forward pass of the `TransformerLayer`. Also, make sure the output
+        format matches the output of the HF's `LlamaDecoderLayer`.
+        """
+        return (
+            super().forward(
+                hidden_states,
+                attention_mask=attention_mask,
+                rotary_pos_emb=self.te_rope_emb,
+            ),
+        )
+
+
+class TELlamaModel:
+    """
+    LM created with `LlamaModel`. The underlying `LlamaDecoderLayer`
+    class is monkey-patched with `TELlamaDecoderLayer` class before
+    initializing the causal LM with `LlamaModel`.
+
+    Args:
+        config: LlamaConfig
+    """
+
+    def __new__(cls, config: LlamaConfig):
+        with replace_decoder(
+            te_decoder_cls=TELlamaDecoderLayer, llama_rms_norm_cls=LlamaRMSNorm
+        ):
+            model = LlamaModel(config)
+        return model
+
+    @classmethod
+    def from_state_dict(
+        cls,
+        state_dict: Dict[str, torch.Tensor],
+        config: LlamaConfig,
+    ):
+        """
+        Custom method adapted from `from_pretrained` method in HuggingFace
+        Transformers repo: https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579
+        """
+
+        vanilla_model = cls(config)
+
+        # replace_params copies parameters relevant only to TransformerEngine
+        _replace_params(state_dict, vanilla_model.state_dict(), config)
+        # _load_state_dict_into_model copies parameters other than those in TransformerEngine
+        _load_state_dict_into_model(vanilla_model, state_dict, start_prefix="")
+
+        return vanilla_model
+
+
+def _replace_params(hf_state_dict, te_state_dict, config):
+    # collect all layer prefixes to update
+    all_layer_prefixes = set()
+    for param_key in hf_state_dict.keys():
+        layer_prefix_pat = "model.layers.\d+."
+        m = re.match(layer_prefix_pat, param_key)
+        if m is not None:
+            all_layer_prefixes.add(m.group())
+
+    for layer_prefix in all_layer_prefixes:
+        # When loading weights into models with less number of layers, skip the
+        # copy if the corresponding layer doesn't exist in HF model
+        if layer_prefix + "input_layernorm.weight" in hf_state_dict:
+            te_state_dict[
+                layer_prefix + "self_attention.layernorm_qkv.layer_norm_weight"
+            ].data[:] = hf_state_dict[layer_prefix + "input_layernorm.weight"].data[:]
+
+        if layer_prefix + "self_attn.q_proj.weight" in hf_state_dict:
+            te_state_dict[
+                layer_prefix + "self_attention.layernorm_qkv.query_weight"
+            ].data[:] = hf_state_dict[layer_prefix + "self_attn.q_proj.weight"].data[:]
+
+        if layer_prefix + "self_attn.k_proj.weight" in hf_state_dict:
+            te_state_dict[
+                layer_prefix + "self_attention.layernorm_qkv.key_weight"
+            ].data[:] = hf_state_dict[layer_prefix + "self_attn.k_proj.weight"].data[:]
+
+        if layer_prefix + "self_attn.v_proj.weight" in hf_state_dict:
+            te_state_dict[
+                layer_prefix + "self_attention.layernorm_qkv.value_weight"
+            ].data[:] = hf_state_dict[layer_prefix + "self_attn.v_proj.weight"].data[:]
+
+        if layer_prefix + "self_attn.o_proj.weight" in hf_state_dict:
+            te_state_dict[layer_prefix + "self_attention.proj.weight"].data[:] = (
+                hf_state_dict[layer_prefix + "self_attn.o_proj.weight"].data[:]
+            )
+
+        if layer_prefix + "post_attention_layernorm.weight" in hf_state_dict:
+            te_state_dict[layer_prefix + "layernorm_mlp.layer_norm_weight"].data[:] = (
+                hf_state_dict[layer_prefix + "post_attention_layernorm.weight"].data[:]
+            )
+
+        # It may happen that gate_proj.weight and up_proj.weight will be in the different files, so we need to
+        # load them separately.
+        if layer_prefix + "mlp.gate_proj.weight" in hf_state_dict:
+            te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[
+                : config.intermediate_size
+            ] = hf_state_dict[layer_prefix + "mlp.gate_proj.weight"].data
+
+        if layer_prefix + "mlp.up_proj.weight" in hf_state_dict:
+            te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[
+                config.intermediate_size :
+            ] = hf_state_dict[layer_prefix + "mlp.up_proj.weight"].data
+
+        if layer_prefix + "mlp.down_proj.weight" in hf_state_dict:
+            te_state_dict[layer_prefix + "layernorm_mlp.fc2_weight"].data[:] = (
+                hf_state_dict[layer_prefix + "mlp.down_proj.weight"].data[:]
+            )
+    return all_layer_prefixes
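
GPT.from_pretrained later in this commit swaps in the TE model essentially this way: build the HF LlamaModel, copy its state dict into the TransformerEngine layout, then drop the original. A rough sketch under the assumptions that a CUDA device and transformer_engine are available and that the config mirrors the GPT defaults from config.py:

import gc

from transformers import LlamaConfig, LlamaModel

from ChatTTS.model.cuda import TELlamaModel

config = LlamaConfig(
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    num_hidden_layers=20,
    max_position_embeddings=4096,
)
hf_model = LlamaModel(config).cuda().eval()   # in practice: LlamaModel.from_pretrained(gpt_folder)

state_dict = hf_model.state_dict()
te_model = TELlamaModel.from_state_dict(state_dict, config)
del state_dict, hf_model                      # force memory release, as gpt.py does below
gc.collect()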
ChatTTS/model/dvae.py ADDED
@@ -0,0 +1,296 @@
1
+ import math
2
+ from typing import List, Optional, Literal, Union
3
+
4
+ import numpy as np
5
+ import pybase16384 as b14
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ from vector_quantize_pytorch import GroupedResidualFSQ
11
+
12
+
13
+ class ConvNeXtBlock(nn.Module):
14
+ def __init__(
15
+ self,
16
+ dim: int,
17
+ intermediate_dim: int,
18
+ kernel: int,
19
+ dilation: int,
20
+ layer_scale_init_value: float = 1e-6,
21
+ ):
22
+ # ConvNeXt Block copied from Vocos.
23
+ super().__init__()
24
+ self.dwconv = nn.Conv1d(
25
+ dim,
26
+ dim,
27
+ kernel_size=kernel,
28
+ padding=dilation * (kernel // 2),
29
+ dilation=dilation,
30
+ groups=dim,
31
+ ) # depthwise conv
32
+
33
+ self.norm = nn.LayerNorm(dim, eps=1e-6)
34
+ self.pwconv1 = nn.Linear(
35
+ dim, intermediate_dim
36
+ ) # pointwise/1x1 convs, implemented with linear layers
37
+ self.act = nn.GELU()
38
+ self.pwconv2 = nn.Linear(intermediate_dim, dim)
39
+ self.gamma = (
40
+ nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
41
+ if layer_scale_init_value > 0
42
+ else None
43
+ )
44
+
45
+ def forward(self, x: torch.Tensor, cond=None) -> torch.Tensor:
46
+ residual = x
47
+
48
+ y = self.dwconv(x)
49
+ y.transpose_(1, 2) # (B, C, T) -> (B, T, C)
50
+ x = self.norm(y)
51
+ del y
52
+ y = self.pwconv1(x)
53
+ del x
54
+ x = self.act(y)
55
+ del y
56
+ y = self.pwconv2(x)
57
+ del x
58
+ if self.gamma is not None:
59
+ y *= self.gamma
60
+ y.transpose_(1, 2) # (B, T, C) -> (B, C, T)
61
+
62
+ x = y + residual
63
+ del y
64
+
65
+ return x
66
+
67
+
68
+ class GFSQ(nn.Module):
69
+
70
+ def __init__(
71
+ self, dim: int, levels: List[int], G: int, R: int, eps=1e-5, transpose=True
72
+ ):
73
+ super(GFSQ, self).__init__()
74
+ self.quantizer = GroupedResidualFSQ(
75
+ dim=dim,
76
+ levels=list(levels),
77
+ num_quantizers=R,
78
+ groups=G,
79
+ )
80
+ self.n_ind = math.prod(levels)
81
+ self.eps = eps
82
+ self.transpose = transpose
83
+ self.G = G
84
+ self.R = R
85
+
86
+ def _embed(self, x: torch.Tensor):
87
+ if self.transpose:
88
+ x = x.transpose(1, 2)
89
+ """
90
+ x = rearrange(
91
+ x, "b t (g r) -> g b t r", g = self.G, r = self.R,
92
+ )
93
+ """
94
+ x = x.view(x.size(0), x.size(1), self.G, self.R).permute(2, 0, 1, 3)
95
+ feat = self.quantizer.get_output_from_indices(x)
96
+ return feat.transpose_(1, 2) if self.transpose else feat
97
+
98
+ def __call__(self, x: torch.Tensor) -> torch.Tensor:
99
+ return super().__call__(x)
100
+
101
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
102
+ if self.transpose:
103
+ x.transpose_(1, 2)
104
+ # feat, ind = self.quantizer(x)
105
+ _, ind = self.quantizer(x)
106
+ """
107
+ ind = rearrange(
108
+ ind, "g b t r ->b t (g r)",
109
+ )
110
+ """
111
+ ind = ind.permute(1, 2, 0, 3).contiguous()
112
+ ind = ind.view(ind.size(0), ind.size(1), -1)
113
+ """
114
+ embed_onehot_tmp = F.one_hot(ind.long(), self.n_ind)
115
+ embed_onehot = embed_onehot_tmp.to(x.dtype)
116
+ del embed_onehot_tmp
117
+ e_mean = torch.mean(embed_onehot, dim=[0, 1])
118
+ # e_mean = e_mean / (e_mean.sum(dim=1) + self.eps).unsqueeze(1)
119
+ torch.div(e_mean, (e_mean.sum(dim=1) + self.eps).unsqueeze(1), out=e_mean)
120
+ perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + self.eps), dim=1))
121
+
122
+ return
123
+ torch.zeros(perplexity.shape, dtype=x.dtype, device=x.device),
124
+ feat.transpose_(1, 2) if self.transpose else feat,
125
+ perplexity,
126
+ """
127
+ return ind.transpose_(1, 2) if self.transpose else ind
128
+
129
+
130
+ class DVAEDecoder(nn.Module):
131
+ def __init__(
132
+ self,
133
+ idim: int,
134
+ odim: int,
135
+ n_layer=12,
136
+ bn_dim=64,
137
+ hidden=256,
138
+ kernel=7,
139
+ dilation=2,
140
+ up=False,
141
+ ):
142
+ super().__init__()
143
+ self.up = up
144
+ self.conv_in = nn.Sequential(
145
+ nn.Conv1d(idim, bn_dim, 3, 1, 1),
146
+ nn.GELU(),
147
+ nn.Conv1d(bn_dim, hidden, 3, 1, 1),
148
+ )
149
+ self.decoder_block = nn.ModuleList(
150
+ [
151
+ ConvNeXtBlock(
152
+ hidden,
153
+ hidden * 4,
154
+ kernel,
155
+ dilation,
156
+ )
157
+ for _ in range(n_layer)
158
+ ]
159
+ )
160
+ self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False)
161
+
162
+ def forward(self, x: torch.Tensor, conditioning=None) -> torch.Tensor:
163
+ # B, C, T
164
+ y = self.conv_in(x)
165
+ del x
166
+ for f in self.decoder_block:
167
+ y = f(y, conditioning)
168
+
169
+ x = self.conv_out(y)
170
+ del y
171
+ return x
172
+
173
+
174
+ class MelSpectrogramFeatures(torch.nn.Module):
175
+ def __init__(
176
+ self,
177
+ sample_rate=24000,
178
+ n_fft=1024,
179
+ hop_length=256,
180
+ n_mels=100,
181
+ padding: Literal["center", "same"] = "center",
182
+ device: torch.device = torch.device("cpu"),
183
+ ):
184
+ super().__init__()
185
+ self.device = device
186
+ if padding not in ["center", "same"]:
187
+ raise ValueError("Padding must be 'center' or 'same'.")
188
+ self.padding = padding
189
+ self.mel_spec = torchaudio.transforms.MelSpectrogram(
190
+ sample_rate=sample_rate,
191
+ n_fft=n_fft,
192
+ hop_length=hop_length,
193
+ n_mels=n_mels,
194
+ center=padding == "center",
195
+ power=1,
196
+ )
197
+
198
+ def __call__(self, audio: torch.Tensor) -> torch.Tensor:
199
+ return super().__call__(audio)
200
+
201
+ def forward(self, audio: torch.Tensor) -> torch.Tensor:
202
+ audio = audio.to(self.device)
203
+ mel: torch.Tensor = self.mel_spec(audio)
204
+ features = torch.log(torch.clip(mel, min=1e-5))
205
+ return features
206
+
207
+
208
+ class DVAE(nn.Module):
209
+ def __init__(
210
+ self,
211
+ decoder_config: dict,
212
+ encoder_config: Optional[dict] = None,
213
+ vq_config: Optional[dict] = None,
214
+ dim=512,
215
+ coef: Optional[str] = None,
216
+ device: torch.device = torch.device("cpu"),
217
+ ):
218
+ super().__init__()
219
+ if coef is None:
220
+ coef = torch.rand(100)
221
+ else:
222
+ coef = torch.from_numpy(
223
+ np.frombuffer(b14.decode_from_string(coef), dtype=np.float32).copy()
224
+ )
225
+ self.register_buffer("coef", coef.unsqueeze(0).unsqueeze_(2))
226
+
227
+ if encoder_config is not None:
228
+ self.downsample_conv = nn.Sequential(
229
+ nn.Conv1d(100, dim, 3, 1, 1),
230
+ nn.GELU(),
231
+ nn.Conv1d(dim, dim, 4, 2, 1),
232
+ nn.GELU(),
233
+ )
234
+ self.preprocessor_mel = MelSpectrogramFeatures(device=device)
235
+ self.encoder: Optional[DVAEDecoder] = DVAEDecoder(**encoder_config)
236
+
237
+ self.decoder = DVAEDecoder(**decoder_config)
238
+ self.out_conv = nn.Conv1d(dim, 100, 3, 1, 1, bias=False)
239
+ if vq_config is not None:
240
+ self.vq_layer = GFSQ(**vq_config)
241
+ else:
242
+ self.vq_layer = None
243
+
244
+ def __repr__(self) -> str:
245
+ return b14.encode_to_string(
246
+ self.coef.cpu().numpy().astype(np.float32).tobytes()
247
+ )
248
+
249
+ def __call__(
250
+ self, inp: torch.Tensor, mode: Literal["encode", "decode"] = "decode"
251
+ ) -> torch.Tensor:
252
+ return super().__call__(inp, mode)
253
+
254
+ @torch.inference_mode()
255
+ def forward(
256
+ self, inp: torch.Tensor, mode: Literal["encode", "decode"] = "decode"
257
+ ) -> torch.Tensor:
258
+ if mode == "encode" and hasattr(self, "encoder") and self.vq_layer is not None:
259
+ mel = self.preprocessor_mel(inp)
260
+ x: torch.Tensor = self.downsample_conv(
261
+ torch.div(mel, self.coef.view(100, 1).expand(mel.shape), out=mel),
262
+ ).unsqueeze_(0)
263
+ del mel
264
+ x = self.encoder(x)
265
+ ind = self.vq_layer(x)
266
+ del x
267
+ return ind
268
+
269
+ if self.vq_layer is not None:
270
+ vq_feats = self.vq_layer._embed(inp)
271
+ else:
272
+ vq_feats = inp
273
+
274
+ vq_feats = (
275
+ vq_feats.view(
276
+ (vq_feats.size(0), 2, vq_feats.size(1) // 2, vq_feats.size(2)),
277
+ )
278
+ .permute(0, 2, 3, 1)
279
+ .flatten(2)
280
+ )
281
+
282
+ dec_out = self.out_conv(
283
+ self.decoder(
284
+ x=vq_feats,
285
+ ),
286
+ )
287
+
288
+ del vq_feats
289
+
290
+ return torch.mul(dec_out, self.coef, out=dec_out)
291
+
292
+ @torch.inference_mode()
293
+ def sample_audio(self, wav: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
294
+ if isinstance(wav, np.ndarray):
295
+ wav = torch.from_numpy(wav)
296
+ return self(wav, "encode").squeeze_(0)
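
DVAE therefore acts as the bridge between waveforms and discrete audio codes: mode "encode" turns a mono waveform into VQ indices (used by Chat.sample_audio_speaker above), while the default "decode" mode turns indices back into a 100-bin mel spectrogram for Vocos. A rough shape-only sketch with random, untrained weights (checkpoint loading as done in Chat._load is omitted):

from dataclasses import asdict

import torch

from ChatTTS.config import Config
from ChatTTS.model import DVAE

cfg = Config()
dvae = DVAE(
    decoder_config=asdict(cfg.dvae.decoder),
    encoder_config=asdict(cfg.dvae.encoder),
    vq_config=asdict(cfg.dvae.vq),
    dim=cfg.dvae.decoder.idim,
).eval()

wav = torch.randn(24000)            # one second of fake 24 kHz mono audio
codes = dvae.sample_audio(wav)      # (num_vq, T') discrete indices
mel = dvae(codes.unsqueeze(0))      # mode defaults to "decode" -> (1, 100, 2 * T') mel frames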
ChatTTS/model/embed.py ADDED
@@ -0,0 +1,81 @@
+from safetensors.torch import safe_open
+import torch
+import torch.nn as nn
+from torch.nn.utils.parametrizations import weight_norm
+
+
+class Embed(nn.Module):
+    def __init__(
+        self, hidden_size: int, num_audio_tokens: int, num_text_tokens: int, num_vq=4
+    ):
+        super().__init__()
+
+        self.num_vq = num_vq
+        self.num_audio_tokens = num_audio_tokens
+
+        self.model_dim = hidden_size
+        self.emb_code = nn.ModuleList(
+            [nn.Embedding(num_audio_tokens, self.model_dim) for _ in range(num_vq)],
+        )
+        self.emb_text = nn.Embedding(num_text_tokens, self.model_dim)
+
+        self.head_text = weight_norm(
+            nn.Linear(self.model_dim, num_text_tokens, bias=False),
+            name="weight",
+        )
+        self.head_code = nn.ModuleList(
+            [
+                weight_norm(
+                    nn.Linear(self.model_dim, num_audio_tokens, bias=False),
+                    name="weight",
+                )
+                for _ in range(self.num_vq)
+            ],
+        )
+
+    @torch.inference_mode()
+    def from_pretrained(self, filename: str, device: torch.device):
+        state_dict_tensors = {}
+        with safe_open(filename, framework="pt") as f:
+            for k in f.keys():
+                state_dict_tensors[k] = f.get_tensor(k)
+        self.load_state_dict(state_dict_tensors)
+        self.to(device)
+
+    def __call__(
+        self, input_ids: torch.Tensor, text_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        get_emb
+        """
+        return super().__call__(input_ids, text_mask)
+
+    @torch.inference_mode()
+    def forward(self, input_ids: torch.Tensor, text_mask: torch.Tensor) -> torch.Tensor:
+        """
+        get_emb
+        """
+        device = next(self.parameters()).device
+        emb_text: torch.Tensor = self.emb_text(
+            input_ids[text_mask].narrow(1, 0, 1).squeeze_(1).to(device)
+        )
+
+        text_mask_inv = text_mask.logical_not().to(device)
+        masked_input_ids: torch.Tensor = input_ids[text_mask_inv].to(device)
+
+        emb_code = [
+            self.emb_code[i](masked_input_ids[:, i]) for i in range(self.num_vq)
+        ]
+        emb_code = torch.stack(emb_code, 2).sum(2)
+
+        emb = torch.zeros(
+            (input_ids.shape[:-1]) + (emb_text.shape[-1],),
+            device=emb_text.device,
+            dtype=emb_text.dtype,
+        )
+        emb[text_mask] = emb_text
+        emb[text_mask_inv] = emb_code.to(emb.dtype)
+
+        del emb_text, emb_code, text_mask_inv
+
+        return emb
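
Embed keeps one table for text tokens and one per audio codebook; text_mask decides which applies at each position (text positions read channel 0 of input_ids, audio positions sum the num_vq code embeddings). A small shape sketch with made-up ids and untrained weights:

import torch

from ChatTTS.model import Embed

embed = Embed(hidden_size=768, num_audio_tokens=626, num_text_tokens=21178, num_vq=4)

input_ids = torch.zeros(1, 10, 4, dtype=torch.long)   # (batch, seq, num_vq)
input_ids[0, :6, 0] = torch.arange(6)                 # 6 text tokens in channel 0
input_ids[0, 6:] = torch.randint(0, 626, (4, 4))      # 4 audio steps across 4 codebooks
text_mask = torch.zeros(1, 10, dtype=torch.bool)
text_mask[0, :6] = True

emb = embed(input_ids, text_mask)
print(emb.shape)  # torch.Size([1, 10, 768])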
ChatTTS/model/gpt.py ADDED
@@ -0,0 +1,613 @@
1
+ import platform
2
+ from dataclasses import dataclass
3
+ import logging
4
+ from typing import Union, List, Optional, Tuple, Callable
5
+ import gc
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.nn.utils.parametrize as P
11
+ from tqdm import tqdm
12
+ from transformers import LlamaModel, LlamaConfig
13
+ from transformers.cache_utils import Cache
14
+ from transformers.modeling_outputs import BaseModelOutputWithPast
15
+ from transformers.utils import is_flash_attn_2_available
16
+
17
+ from ..utils import del_all
18
+ from .embed import Embed
19
+
20
+
21
+ class GPT(nn.Module):
22
+ def __init__(
23
+ self,
24
+ gpt_config: dict,
25
+ embed: Embed,
26
+ use_flash_attn=False,
27
+ use_vllm=False,
28
+ device=torch.device("cpu"),
29
+ device_gpt=torch.device("cpu"),
30
+ logger=logging.getLogger(__name__),
31
+ ):
32
+ super().__init__()
33
+
34
+ self.logger = logger
35
+
36
+ self.device = device
37
+ self.device_gpt = device_gpt
38
+
39
+ self.generator = torch.Generator(device=device)
40
+
41
+ self.num_vq = int(gpt_config["num_vq"])
42
+ self.num_audio_tokens = int(gpt_config["num_audio_tokens"])
43
+ self.num_text_tokens = int(gpt_config["num_text_tokens"])
44
+
45
+ self.use_flash_attn = use_flash_attn
46
+ self.is_te_llama = False
47
+ self.is_vllm = use_vllm
48
+
49
+ if self.is_vllm:
50
+ return
51
+
52
+ self.llama_config = self._build_llama_config(gpt_config)
53
+
54
+ self.emb_code = [ec.__call__ for ec in embed.emb_code]
55
+ self.emb_text = embed.emb_text.__call__
56
+ self.head_text = embed.head_text.__call__
57
+ self.head_code = [hc.__call__ for hc in embed.head_code]
58
+
59
+ def from_pretrained(
60
+ self, gpt_folder: str, embed_file_path: str, experimental=False
61
+ ):
62
+ if self.is_vllm and platform.system().lower() == "linux":
63
+
64
+ from .velocity import LLM
65
+
66
+ self.llm = LLM(
67
+ model=gpt_folder,
68
+ num_audio_tokens=self.num_audio_tokens,
69
+ num_text_tokens=self.num_text_tokens,
70
+ post_model_path=embed_file_path,
71
+ )
72
+ self.logger.info("vLLM model loaded")
73
+ return
74
+
75
+ self.gpt: LlamaModel = LlamaModel.from_pretrained(gpt_folder).to(
76
+ self.device_gpt
77
+ )
78
+ del self.gpt.embed_tokens
79
+
80
+ if (
81
+ experimental
82
+ and "cuda" in str(self.device_gpt)
83
+ and platform.system().lower() == "linux"
84
+ ): # is TELlamaModel
85
+ try:
86
+ from .cuda import TELlamaModel
87
+
88
+ self.logger.warning(
89
+ "Linux with CUDA, try NVIDIA accelerated TELlamaModel because experimental is enabled"
90
+ )
91
+ state_dict = self.gpt.state_dict()
92
+ vanilla = TELlamaModel.from_state_dict(state_dict, self.llama_config)
93
+ # Force mem release. Taken from huggingface code
94
+ del state_dict, self.gpt
95
+ gc.collect()
96
+ self.gpt = vanilla
97
+ self.is_te_llama = True
98
+ except Exception as e:
99
+ self.logger.warning(
100
+ f"use default LlamaModel for importing TELlamaModel error: {e}"
101
+ )
102
+
103
+ class Context:
104
+ def __init__(self):
105
+ self._interrupt = False
106
+
107
+ def set(self, v: bool):
108
+ self._interrupt = v
109
+
110
+ def get(self) -> bool:
111
+ return self._interrupt
112
+
113
+ def _build_llama_config(
114
+ self,
115
+ config: dict,
116
+ ) -> Tuple[LlamaModel, LlamaConfig]:
117
+
118
+ if self.use_flash_attn and is_flash_attn_2_available():
119
+ llama_config = LlamaConfig(
120
+ **config,
121
+ attn_implementation="flash_attention_2",
122
+ )
123
+ self.logger.warning(
124
+ "enabling flash_attention_2 may make gpt be even slower"
125
+ )
126
+ else:
127
+ llama_config = LlamaConfig(**config)
128
+
129
+ return llama_config
130
+
131
+ def prepare(self, compile=False):
132
+ if self.use_flash_attn and is_flash_attn_2_available():
133
+ self.gpt = self.gpt.to(dtype=torch.float16)
134
+ if compile and not self.is_te_llama and not self.is_vllm:
135
+ try:
136
+ self.compile(backend="inductor", dynamic=True)
137
+ self.gpt.compile(backend="inductor", dynamic=True)
138
+ except RuntimeError as e:
139
+ self.logger.warning(f"compile failed: {e}. fallback to normal mode.")
140
+
141
+ @dataclass(repr=False, eq=False)
142
+ class _GenerationInputs:
143
+ position_ids: torch.Tensor
144
+ cache_position: torch.Tensor
145
+ use_cache: bool
146
+ input_ids: Optional[torch.Tensor] = None
147
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
148
+ attention_mask: Optional[torch.Tensor] = None
149
+ inputs_embeds: Optional[torch.Tensor] = None
150
+
151
+ def to(self, device: torch.device, dtype: torch.dtype):
152
+ if self.attention_mask is not None:
153
+ self.attention_mask = self.attention_mask.to(device, dtype=dtype)
154
+ if self.position_ids is not None:
155
+ self.position_ids = self.position_ids.to(device, dtype=dtype)
156
+ if self.inputs_embeds is not None:
157
+ self.inputs_embeds = self.inputs_embeds.to(device, dtype=dtype)
158
+ if self.cache_position is not None:
159
+ self.cache_position = self.cache_position.to(device, dtype=dtype)
160
+
161
+ @torch.no_grad()
162
+ def _prepare_generation_inputs(
163
+ self,
164
+ input_ids: torch.Tensor,
165
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
166
+ attention_mask: Optional[torch.Tensor] = None,
167
+ inputs_embeds: Optional[torch.Tensor] = None,
168
+ cache_position: Optional[torch.Tensor] = None,
169
+ position_ids: Optional[torch.Tensor] = None,
170
+ use_cache=True,
171
+ ) -> _GenerationInputs:
172
+ # With static cache, the `past_key_values` is None
173
+ # TODO joao: standardize interface for the different Cache classes and remove of this if
174
+ has_static_cache = False
175
+ if past_key_values is None:
176
+ if hasattr(self.gpt.layers[0], "self_attn"):
177
+ past_key_values = getattr(
178
+ self.gpt.layers[0].self_attn, "past_key_value", None
179
+ )
180
+ has_static_cache = past_key_values is not None
181
+
182
+ past_length = 0
183
+ if past_key_values is not None:
184
+ if isinstance(past_key_values, Cache):
185
+ past_length = (
186
+ int(cache_position[0])
187
+ if cache_position is not None
188
+ else past_key_values.get_seq_length()
189
+ )
190
+ max_cache_length = past_key_values.get_max_length()
191
+ cache_length = (
192
+ past_length
193
+ if max_cache_length is None
194
+ else min(max_cache_length, past_length)
195
+ )
196
+ # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
197
+ else:
198
+ cache_length = past_length = past_key_values[0][0].shape[2]
199
+ max_cache_length = None
200
+
201
+ # Keep only the unprocessed tokens:
202
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
203
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
204
+ # input)
205
+ if (
206
+ attention_mask is not None
207
+ and attention_mask.shape[1] > input_ids.shape[1]
208
+ ):
209
+ start = attention_mask.shape[1] - past_length
210
+ input_ids = input_ids.narrow(1, -start, start)
211
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
212
+ # input_ids based on the past_length.
213
+ elif past_length < input_ids.shape[1]:
214
+ input_ids = input_ids.narrow(
215
+ 1, past_length, input_ids.size(1) - past_length
216
+ )
217
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
218
+
219
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
220
+ if (
221
+ max_cache_length is not None
222
+ and attention_mask is not None
223
+ and cache_length + input_ids.shape[1] > max_cache_length
224
+ ):
225
+ attention_mask = attention_mask.narrow(
226
+ 1, -max_cache_length, max_cache_length
227
+ )
228
+
229
+ if attention_mask is not None and position_ids is None:
230
+ # create position_ids on the fly for batch generation
231
+ position_ids = attention_mask.long().cumsum(-1) - 1
232
+ position_ids.masked_fill_(attention_mask.eq(0), 1)
233
+ if past_key_values:
234
+ position_ids = position_ids.narrow(
235
+ 1, -input_ids.shape[1], input_ids.shape[1]
236
+ )
237
+
238
+ input_length = (
239
+ position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
240
+ )
241
+ if cache_position is None:
242
+ cache_position = torch.arange(
243
+ past_length, past_length + input_length, device=input_ids.device
244
+ )
245
+ else:
246
+ cache_position = cache_position.narrow(0, -input_length, input_length)
247
+
248
+ if has_static_cache:
249
+ past_key_values = None
250
+
251
+ model_inputs = self._GenerationInputs(
252
+ position_ids=position_ids,
253
+ cache_position=cache_position,
254
+ use_cache=use_cache,
255
+ )
256
+
257
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
258
+ if inputs_embeds is not None and past_key_values is None:
259
+ model_inputs.inputs_embeds = inputs_embeds
260
+ else:
261
+ # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
262
+ # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
263
+ # TODO: use `next_tokens` directly instead.
264
+ model_inputs.input_ids = input_ids.contiguous()
265
+
266
+ model_inputs.past_key_values = past_key_values
267
+ model_inputs.attention_mask = attention_mask
268
+
269
+ return model_inputs
270
+
271
+ @dataclass(repr=False, eq=False)
272
+ class GenerationOutputs:
273
+ ids: List[torch.Tensor]
274
+ attentions: List[Optional[Tuple[torch.FloatTensor, ...]]]
275
+ hiddens: List[torch.Tensor]
276
+
277
+ def destroy(self):
278
+ del_all(self.ids)
279
+ del_all(self.attentions)
280
+ del_all(self.hiddens)
281
+
282
+ @torch.no_grad()
283
+ def _prepare_generation_outputs(
284
+ self,
285
+ inputs_ids: torch.Tensor,
286
+ start_idx: int,
287
+ end_idx: torch.Tensor,
288
+ attentions: List[Optional[Tuple[torch.FloatTensor, ...]]],
289
+ hiddens: List[torch.Tensor],
290
+ infer_text: bool,
291
+ ) -> GenerationOutputs:
292
+ inputs_ids = [
293
+ inputs_ids[idx].narrow(0, start_idx, i) for idx, i in enumerate(end_idx)
294
+ ]
295
+ if infer_text:
296
+ inputs_ids = [i.narrow(1, 0, 1).squeeze_(1) for i in inputs_ids]
297
+
298
+ if len(hiddens) > 0:
299
+ hiddens = torch.stack(hiddens, 1)
300
+ hiddens = [
301
+ hiddens[idx].narrow(0, 0, i) for idx, i in enumerate(end_idx.int())
302
+ ]
303
+
304
+ return self.GenerationOutputs(
305
+ ids=inputs_ids,
306
+ attentions=attentions,
307
+ hiddens=hiddens,
308
+ )
309
+
310
+ @torch.no_grad()
311
+ def generate(
312
+ self,
313
+ emb: torch.Tensor,
314
+ inputs_ids: torch.Tensor,
315
+ temperature: torch.Tensor,
316
+ eos_token: Union[int, torch.Tensor],
317
+ attention_mask: Optional[torch.Tensor] = None,
318
+ max_new_token=2048,
319
+ min_new_token=0,
320
+ logits_processors: Tuple[
321
+ Callable[[torch.LongTensor, torch.FloatTensor], torch.FloatTensor]
322
+ ] = (),
323
+ infer_text=False,
324
+ return_attn=False,
325
+ return_hidden=False,
326
+ stream=False,
327
+ show_tqdm=True,
328
+ ensure_non_empty=True,
329
+ stream_batch=24,
330
+ manual_seed: Optional[int] = None,
331
+ context=Context(),
332
+ ):
333
+
334
+ attentions: List[Optional[Tuple[torch.FloatTensor, ...]]] = []
335
+ hiddens = []
336
+ stream_iter = 0
337
+
338
+ start_idx, end_idx = inputs_ids.shape[1], torch.zeros(
339
+ inputs_ids.shape[0], device=inputs_ids.device, dtype=torch.long
340
+ )
341
+ finish = torch.zeros(inputs_ids.shape[0], device=inputs_ids.device).bool()
342
+
343
+ old_temperature = temperature
344
+
345
+ temperature = (
346
+ temperature.unsqueeze(0)
347
+ .expand(inputs_ids.shape[0], -1)
348
+ .contiguous()
349
+ .view(-1, 1)
350
+ )
351
+
352
+ attention_mask_cache = torch.ones(
353
+ (
354
+ inputs_ids.shape[0],
355
+ inputs_ids.shape[1] + max_new_token,
356
+ ),
357
+ dtype=torch.bool,
358
+ device=inputs_ids.device,
359
+ )
360
+ if attention_mask is not None:
361
+ attention_mask_cache.narrow(1, 0, attention_mask.shape[1]).copy_(
362
+ attention_mask
363
+ )
364
+
365
+ progress = inputs_ids.size(1)
366
+ # pre-allocate inputs_ids
367
+ inputs_ids_buf = torch.zeros(
368
+ inputs_ids.size(0),
369
+ progress + max_new_token,
370
+ inputs_ids.size(2),
371
+ dtype=inputs_ids.dtype,
372
+ device=inputs_ids.device,
373
+ )
374
+ inputs_ids_buf.narrow(1, 0, progress).copy_(inputs_ids)
375
+ del inputs_ids
376
+ inputs_ids = inputs_ids_buf.narrow(1, 0, progress)
377
+
378
+ pbar: Optional[tqdm] = None
379
+
380
+ if show_tqdm:
381
+ pbar = tqdm(
382
+ total=max_new_token,
383
+ desc="text" if infer_text else "code",
384
+ bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}]",
385
+ )
386
+
387
+ past_key_values = None
388
+
389
+ for i in range(max_new_token):
390
+
391
+ model_input = self._prepare_generation_inputs(
392
+ inputs_ids,
393
+ past_key_values,
394
+ attention_mask_cache.narrow(1, 0, inputs_ids.shape[1]),
395
+ use_cache=not self.is_te_llama,
396
+ )
397
+
398
+ if i > 0:
399
+ del emb
400
+ inputs_ids_emb = model_input.input_ids.to(self.device_gpt)
401
+ if infer_text:
402
+ emb: torch.Tensor = self.emb_text(inputs_ids_emb[:, :, 0])
403
+ else:
404
+ code_emb = [
405
+ self.emb_code[i](inputs_ids_emb[:, :, i])
406
+ for i in range(self.num_vq)
407
+ ]
408
+ emb = torch.stack(code_emb, 3).sum(3)
409
+ del inputs_ids_emb, model_input.input_ids
410
+ model_input.inputs_embeds = emb
411
+
412
+ model_input.to(self.device_gpt, self.gpt.dtype)
413
+
414
+ outputs: BaseModelOutputWithPast = self.gpt(
415
+ attention_mask=model_input.attention_mask,
416
+ position_ids=model_input.position_ids,
417
+ past_key_values=model_input.past_key_values,
418
+ inputs_embeds=model_input.inputs_embeds,
419
+ use_cache=model_input.use_cache,
420
+ output_attentions=return_attn,
421
+ cache_position=model_input.cache_position,
422
+ )
423
+ del_all(model_input)
424
+ attentions.append(outputs.attentions)
425
+ hidden_states = outputs.last_hidden_state.to(
426
+ self.device, dtype=torch.float
427
+ ) # 🐻
428
+ past_key_values = outputs.past_key_values
429
+ del_all(outputs)
430
+ if return_hidden:
431
+ hiddens.append(hidden_states.narrow(1, -1, 1).squeeze_(1))
432
+
433
+ with P.cached():
434
+ if infer_text:
435
+ logits: torch.Tensor = self.head_text(hidden_states)
436
+ else:
437
+ # logits = torch.stack([self.head_code[i](hidden_states) for i in range(self.num_vq)], 3)
438
+ logits = torch.empty(
439
+ hidden_states.size(0),
440
+ hidden_states.size(1),
441
+ self.num_audio_tokens,
442
+ self.num_vq,
443
+ dtype=torch.float,
444
+ device=self.device,
445
+ )
446
+ for num_vq_iter in range(self.num_vq):
447
+ x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
448
+ logits[..., num_vq_iter] = x
449
+ del x
450
+
451
+ del hidden_states
452
+
453
+ # logits = logits[:, -1].float()
454
+ logits = logits.narrow(1, -1, 1).squeeze_(1).float()
455
+
456
+ if not infer_text:
457
+ # logits = rearrange(logits, "b c n -> (b n) c")
458
+ logits = logits.permute(0, 2, 1)
459
+ logits = logits.reshape(-1, logits.size(2))
460
+ # logits_token = rearrange(inputs_ids[:, start_idx:], "b c n -> (b n) c")
461
+ inputs_ids_sliced = inputs_ids.narrow(
462
+ 1,
463
+ start_idx,
464
+ inputs_ids.size(1) - start_idx,
465
+ ).permute(0, 2, 1)
466
+ logits_token = inputs_ids_sliced.reshape(
467
+ inputs_ids_sliced.size(0) * inputs_ids_sliced.size(1),
468
+ -1,
469
+ ).to(self.device)
470
+ del inputs_ids_sliced
471
+ else:
472
+ logits_token = (
473
+ inputs_ids.narrow(
474
+ 1,
475
+ start_idx,
476
+ inputs_ids.size(1) - start_idx,
477
+ )
478
+ .narrow(2, 0, 1)
479
+ .to(self.device)
480
+ )
481
+
482
+ logits /= temperature
483
+
484
+ for logits_processor in logits_processors:
485
+ logits = logits_processor(logits_token, logits)
486
+
487
+ del logits_token
488
+
489
+ if i < min_new_token:
490
+ logits[:, eos_token] = -torch.inf
491
+
492
+ scores = F.softmax(logits, dim=-1)
493
+
494
+ del logits
495
+
496
+ if manual_seed is None:
497
+ idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
498
+ else:
499
+ idx_next = torch.multinomial(
500
+ scores,
501
+ num_samples=1,
502
+ generator=self.generator.manual_seed(manual_seed),
503
+ ).to(finish.device)
504
+
505
+ del scores
506
+
507
+ if not infer_text:
508
+ # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
509
+ idx_next = idx_next.view(-1, self.num_vq)
510
+ finish_or = idx_next.eq(eos_token).any(1)
511
+ finish.logical_or_(finish_or)
512
+ del finish_or
513
+ inputs_ids_buf.narrow(1, progress, 1).copy_(idx_next.unsqueeze_(1))
514
+ else:
515
+ finish_or = idx_next.eq(eos_token).any(1)
516
+ finish.logical_or_(finish_or)
517
+ del finish_or
518
+ inputs_ids_buf.narrow(1, progress, 1).copy_(
519
+ idx_next.unsqueeze_(-1).expand(-1, -1, self.num_vq),
520
+ )
521
+
522
+ if i == 0 and finish.any():
523
+ self.logger.warning(
524
+ "unexpected end at index %s",
525
+ str([unexpected_idx.item() for unexpected_idx in finish.nonzero()]),
526
+ )
527
+ if ensure_non_empty and manual_seed is None:
528
+ if show_tqdm:
529
+ pbar.close()
530
+ self.logger.warning("regenerate in order to ensure non-empty")
531
+ del_all(attentions)
532
+ del_all(hiddens)
533
+ del (
534
+ start_idx,
535
+ end_idx,
536
+ finish,
537
+ temperature,
538
+ attention_mask_cache,
539
+ past_key_values,
540
+ idx_next,
541
+ inputs_ids_buf,
542
+ )
543
+ new_gen = self.generate(
544
+ emb,
545
+ inputs_ids,
546
+ old_temperature,
547
+ eos_token,
548
+ attention_mask,
549
+ max_new_token,
550
+ min_new_token,
551
+ logits_processors,
552
+ infer_text,
553
+ return_attn,
554
+ return_hidden,
555
+ stream,
556
+ show_tqdm,
557
+ ensure_non_empty,
558
+ stream_batch,
559
+ manual_seed,
560
+ context,
561
+ )
562
+ for result in new_gen:
563
+ yield result
564
+ del inputs_ids
565
+ return
566
+
567
+ del idx_next
568
+ progress += 1
569
+ inputs_ids = inputs_ids_buf.narrow(1, 0, progress)
570
+
571
+ not_finished = finish.logical_not().to(end_idx.device)
572
+ end_idx.add_(not_finished.int())
573
+ stream_iter += not_finished.any().int()
574
+ if stream:
575
+ if stream_iter > 0 and stream_iter % stream_batch == 0:
576
+ self.logger.debug("yield stream result, end: %d", end_idx)
577
+ yield self._prepare_generation_outputs(
578
+ inputs_ids,
579
+ start_idx,
580
+ end_idx,
581
+ attentions,
582
+ hiddens,
583
+ infer_text,
584
+ )
585
+ del not_finished
586
+
587
+ if finish.all() or context.get():
588
+ break
589
+
590
+ if pbar is not None:
591
+ pbar.update(1)
592
+
593
+ if pbar is not None:
594
+ pbar.close()
595
+
596
+ if not finish.all():
597
+ if context.get():
598
+ self.logger.warning("generation is interrupted")
599
+ else:
600
+ self.logger.warning(
601
+ f"incomplete result. hit max_new_token: {max_new_token}"
602
+ )
603
+
604
+ del finish, inputs_ids_buf
605
+
606
+ yield self._prepare_generation_outputs(
607
+ inputs_ids,
608
+ start_idx,
609
+ end_idx,
610
+ attentions,
611
+ hiddens,
612
+ infer_text,
613
+ )
ChatTTS/model/processors.py ADDED
@@ -0,0 +1,58 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from transformers.generation import TopKLogitsWarper, TopPLogitsWarper
4
+
5
+
6
+ class CustomRepetitionPenaltyLogitsProcessorRepeat:
7
+
8
+ def __init__(self, penalty: float, max_input_ids: int, past_window: int):
9
+ if not isinstance(penalty, float) or not (penalty > 0):
10
+ raise ValueError(
11
+ f"`penalty` has to be a strictly positive float, but is {penalty}"
12
+ )
13
+
14
+ self.penalty = penalty
15
+ self.max_input_ids = max_input_ids
16
+ self.past_window = past_window
17
+
18
+ def __call__(
19
+ self, input_ids: torch.LongTensor, scores: torch.FloatTensor
20
+ ) -> torch.FloatTensor:
21
+ if input_ids.size(1) > self.past_window:
22
+ input_ids = input_ids.narrow(1, -self.past_window, self.past_window)
23
+ freq = F.one_hot(input_ids, scores.size(1)).sum(1)
24
+ if freq.size(0) > self.max_input_ids:
25
+ freq.narrow(
26
+ 0, self.max_input_ids, freq.size(0) - self.max_input_ids
27
+ ).zero_()
28
+ alpha = torch.pow(self.penalty, freq)
29
+ scores = scores.contiguous()
30
+ inp = scores.multiply(alpha)
31
+ oth = scores.divide(alpha)
32
+ con = scores < 0
33
+ out = torch.where(con, inp, oth)
34
+ del inp, oth, scores, con, alpha
35
+ return out
36
+
37
+
38
+ def gen_logits(
39
+ num_code: int,
40
+ top_P=0.7,
41
+ top_K=20,
42
+ repetition_penalty=1.0,
43
+ ):
44
+ logits_warpers = []
45
+ if top_P is not None:
46
+ logits_warpers.append(TopPLogitsWarper(top_P, min_tokens_to_keep=3))
47
+ if top_K is not None:
48
+ logits_warpers.append(TopKLogitsWarper(top_K, min_tokens_to_keep=3))
49
+
50
+ logits_processors = []
51
+ if repetition_penalty is not None and repetition_penalty != 1:
52
+ logits_processors.append(
53
+ CustomRepetitionPenaltyLogitsProcessorRepeat(
54
+ repetition_penalty, num_code, 16
55
+ )
56
+ )
57
+
58
+ return logits_warpers, logits_processors
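For reference, the custom penalty above maps a score s for a token seen f times in the recent window to s*p^f when s is negative and s/p^f otherwise, so repeated tokens are always pushed down when p > 1. A small self-contained check (the token ids and tensor sizes are arbitrary, not taken from this file):

    # Hedged example of the repetition penalty defined above.
    import torch
    from ChatTTS.model.processors import CustomRepetitionPenaltyLogitsProcessorRepeat

    processor = CustomRepetitionPenaltyLogitsProcessorRepeat(
        penalty=1.05, max_input_ids=626, past_window=16
    )
    input_ids = torch.tensor([[1, 1, 2]])   # token 1 seen twice, token 2 once
    scores = torch.ones(1, 626)             # uniform positive scores
    out = processor(input_ids, scores)
    # token 1: 1 / 1.05**2 ~= 0.907, token 2: 1 / 1.05 ~= 0.952, others: 1.0
    print(out[0, 1].item(), out[0, 2].item(), out[0, 3].item())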
ChatTTS/model/speaker.py ADDED
@@ -0,0 +1,154 @@
1
+ import lzma
2
+ from typing import List, Optional, Union
3
+
4
+ import pybase16384 as b14
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+
10
+ class Speaker:
11
+ def __init__(self, dim: int, spk_cfg: str, device=torch.device("cpu")) -> None:
12
+ spk_stat = torch.from_numpy(
13
+ np.frombuffer(b14.decode_from_string(spk_cfg), dtype=np.float16).copy()
14
+ ).to(device=device)
15
+ self.std, self.mean = spk_stat.requires_grad_(False).chunk(2)
16
+ self.dim = dim
17
+
18
+ def sample_random(self) -> str:
19
+ return self._encode(self._sample_random())
20
+
21
+ @torch.inference_mode()
22
+ def apply(
23
+ self,
24
+ emb: torch.Tensor,
25
+ spk_emb: Union[str, torch.Tensor],
26
+ input_ids: torch.Tensor,
27
+ spk_emb_ids: int,
28
+ device: torch.device,
29
+ inplace: bool = True,
30
+ ) -> torch.Tensor:
31
+ if isinstance(spk_emb, str):
32
+ spk_emb_tensor = torch.from_numpy(self._decode(spk_emb))
33
+ else:
34
+ spk_emb_tensor = spk_emb
35
+ n = (
36
+ F.normalize(
37
+ spk_emb_tensor,
38
+ p=2.0,
39
+ dim=0,
40
+ eps=1e-12,
41
+ )
42
+ .to(device)
43
+ .unsqueeze_(0)
44
+ .expand(emb.size(0), -1)
45
+ .unsqueeze_(1)
46
+ .expand(emb.shape)
47
+ )
48
+ cond = input_ids.narrow(-1, 0, 1).eq(spk_emb_ids).expand(emb.shape)
49
+ out = torch.where(cond, n, emb, out=emb if inplace else None)
50
+ if inplace:
51
+ del cond, n
52
+ return out
53
+
54
+ @staticmethod
55
+ @torch.no_grad()
56
+ def decorate_code_prompts(
57
+ text: List[str],
58
+ prompt: str,
59
+ txt_smp: Optional[str],
60
+ spk_emb: Optional[str],
61
+ ) -> List[str]:
62
+ for i, t in enumerate(text):
63
+ text[i] = (
64
+ t.replace("[Stts]", "")
65
+ .replace("[spk_emb]", "")
66
+ .replace("[empty_spk]", "")
67
+ .strip()
68
+ )
69
+ """
70
+ see https://github.com/2noise/ChatTTS/issues/459
71
+ """
72
+
73
+ if prompt:
74
+ text = [prompt + i for i in text]
75
+
76
+ txt_smp = "" if txt_smp is None else txt_smp
77
+ if spk_emb is not None:
78
+ text = [f"[Stts][spk_emb]{txt_smp}{i}[Ptts]" for i in text]
79
+ else:
80
+ text = [f"[Stts][empty_spk]{txt_smp}{i}[Ptts]" for i in text]
81
+
82
+ return text
83
+
84
+ @staticmethod
85
+ @torch.no_grad()
86
+ def decorate_text_prompts(text: List[str], prompt: str) -> List[str]:
87
+ return [f"[Sbreak]{i}[Pbreak]{prompt}" for i in text]
88
+
89
+ @staticmethod
90
+ @torch.no_grad()
91
+ def encode_prompt(prompt: torch.Tensor) -> str:
92
+ arr: np.ndarray = prompt.cpu().numpy().astype(np.uint16)
93
+ shp = arr.shape
94
+ assert len(shp) == 2, "prompt must be a 2D tensor"
95
+ s = b14.encode_to_string(
96
+ np.array(shp, dtype="<u2").tobytes()
97
+ + lzma.compress(
98
+ arr.astype("<u2").tobytes(),
99
+ format=lzma.FORMAT_RAW,
100
+ filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
101
+ ),
102
+ )
103
+ del arr
104
+ return s
105
+
106
+ @staticmethod
107
+ @torch.no_grad()
108
+ def decode_prompt(prompt: str) -> torch.Tensor:
109
+ dec = b14.decode_from_string(prompt)
110
+ shp = np.frombuffer(dec[:4], dtype="<u2")
111
+ p = np.frombuffer(
112
+ lzma.decompress(
113
+ dec[4:],
114
+ format=lzma.FORMAT_RAW,
115
+ filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
116
+ ),
117
+ dtype="<u2",
118
+ ).copy()
119
+ del dec
120
+ return torch.from_numpy(p.astype(np.int32)).view(*shp)
121
+
122
+ @torch.no_grad()
123
+ def _sample_random(self) -> torch.Tensor:
124
+ spk = (
125
+ torch.randn(self.dim, device=self.std.device, dtype=self.std.dtype)
126
+ .mul_(self.std)
127
+ .add_(self.mean)
128
+ )
129
+ return spk
130
+
131
+ @staticmethod
132
+ @torch.no_grad()
133
+ def _encode(spk_emb: torch.Tensor) -> str:
134
+ arr: np.ndarray = spk_emb.to(dtype=torch.float16, device="cpu").numpy()
135
+ s = b14.encode_to_string(
136
+ lzma.compress(
137
+ arr.tobytes(),
138
+ format=lzma.FORMAT_RAW,
139
+ filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
140
+ ),
141
+ )
142
+ del arr
143
+ return s
144
+
145
+ @staticmethod
146
+ def _decode(spk_emb: str) -> np.ndarray:
147
+ return np.frombuffer(
148
+ lzma.decompress(
149
+ b14.decode_from_string(spk_emb),
150
+ format=lzma.FORMAT_RAW,
151
+ filters=[{"id": lzma.FILTER_LZMA2, "preset": 9 | lzma.PRESET_EXTREME}],
152
+ ),
153
+ dtype=np.float16,
154
+ ).copy()
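The `_encode`/`_decode` pair above is a plain raw-LZMA plus base16384 round trip over a float16 vector, so an encoded speaker string restores bit-for-bit. A short sketch (the 768-dim size is only illustrative, not taken from this file):

    # Hedged round-trip check for the speaker-embedding codec above.
    import torch
    from ChatTTS.model.speaker import Speaker

    emb = torch.randn(768, dtype=torch.float16)         # illustrative size
    encoded = Speaker._encode(emb)                      # compact string form
    decoded = torch.from_numpy(Speaker._decode(encoded))
    assert torch.equal(emb, decoded)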
ChatTTS/model/tokenizer.py ADDED
@@ -0,0 +1,138 @@
1
+ import os
2
+
3
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
4
+ """
5
+ https://stackoverflow.com/questions/62691279/how-to-disable-tokenizers-parallelism-true-false-warning
6
+ """
7
+
8
+ from typing import List, Tuple, Optional, Union
9
+
10
+ import torch
11
+ from transformers import BertTokenizerFast
12
+
13
+ from ..utils import del_all
14
+
15
+
16
+ class Tokenizer:
17
+ def __init__(
18
+ self,
19
+ tokenizer_path: torch.serialization.FILE_LIKE,
20
+ ):
21
+ """
22
+ tokenizer: BertTokenizerFast = torch.load(
23
+ tokenizer_path, map_location=device, mmap=True
24
+ )
25
+ # tokenizer.save_pretrained("asset/tokenizer", legacy_format=False)
26
+ """
27
+ tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(tokenizer_path)
28
+ self._tokenizer = tokenizer
29
+
30
+ self.len = len(tokenizer)
31
+ self.spk_emb_ids = tokenizer.convert_tokens_to_ids("[spk_emb]")
32
+ self.break_0_ids = tokenizer.convert_tokens_to_ids("[break_0]")
33
+ self.eos_token = tokenizer.convert_tokens_to_ids("[Ebreak]")
34
+
35
+ @torch.inference_mode()
36
+ def encode(
37
+ self,
38
+ text: List[str],
39
+ num_vq: int,
40
+ prompt: Optional[torch.Tensor] = None,
41
+ device="cpu",
42
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
43
+
44
+ input_ids_lst = []
45
+ attention_mask_lst = []
46
+ max_input_ids_len = -1
47
+ max_attention_mask_len = -1
48
+ prompt_size = 0
49
+
50
+ if prompt is not None:
51
+ assert prompt.size(0) == num_vq, "prompt dim 0 must equal to num_vq"
52
+ prompt_size = prompt.size(1)
53
+
54
+ # avoid random speaker embedding of tokenizer in the other dims
55
+ for t in text:
56
+ x = self._tokenizer.encode_plus(
57
+ t, return_tensors="pt", add_special_tokens=False, padding=True
58
+ )
59
+ input_ids_lst.append(x["input_ids"].squeeze_(0))
60
+ attention_mask_lst.append(x["attention_mask"].squeeze_(0))
61
+ del_all(x)
62
+ ids_sz = input_ids_lst[-1].size(0)
63
+ if ids_sz > max_input_ids_len:
64
+ max_input_ids_len = ids_sz
65
+ attn_sz = attention_mask_lst[-1].size(0)
66
+ if attn_sz > max_attention_mask_len:
67
+ max_attention_mask_len = attn_sz
68
+
69
+ if prompt is not None:
70
+ max_input_ids_len += prompt_size
71
+ max_attention_mask_len += prompt_size
72
+
73
+ input_ids = torch.zeros(
74
+ len(input_ids_lst),
75
+ max_input_ids_len,
76
+ device=device,
77
+ dtype=input_ids_lst[0].dtype,
78
+ )
79
+ for i in range(len(input_ids_lst)):
80
+ input_ids.narrow(0, i, 1).narrow(
81
+ 1,
82
+ max_input_ids_len - prompt_size - input_ids_lst[i].size(0),
83
+ input_ids_lst[i].size(0),
84
+ ).copy_(
85
+ input_ids_lst[i]
86
+ ) # left padding
87
+ del_all(input_ids_lst)
88
+
89
+ attention_mask = torch.zeros(
90
+ len(attention_mask_lst),
91
+ max_attention_mask_len,
92
+ device=device,
93
+ dtype=attention_mask_lst[0].dtype,
94
+ )
95
+ for i in range(len(attention_mask_lst)):
96
+ attn = attention_mask.narrow(0, i, 1)
97
+ attn.narrow(
98
+ 1,
99
+ max_attention_mask_len - prompt_size - attention_mask_lst[i].size(0),
100
+ attention_mask_lst[i].size(0),
101
+ ).copy_(
102
+ attention_mask_lst[i]
103
+ ) # left padding
104
+ if prompt_size > 0:
105
+ attn.narrow(
106
+ 1,
107
+ max_attention_mask_len - prompt_size,
108
+ prompt_size,
109
+ ).fill_(1)
110
+ del_all(attention_mask_lst)
111
+
112
+ text_mask = attention_mask.bool()
113
+ new_input_ids = input_ids.unsqueeze_(-1).expand(-1, -1, num_vq).clone()
114
+ del input_ids
115
+
116
+ if prompt_size > 0:
117
+ text_mask.narrow(1, max_input_ids_len - prompt_size, prompt_size).fill_(0)
118
+ prompt_t = prompt.t().unsqueeze_(0).expand(new_input_ids.size(0), -1, -1)
119
+ new_input_ids.narrow(
120
+ 1,
121
+ max_input_ids_len - prompt_size,
122
+ prompt_size,
123
+ ).copy_(prompt_t)
124
+ del prompt_t
125
+
126
+ return new_input_ids, attention_mask, text_mask
127
+
128
+ @torch.inference_mode()
129
+ def decode(
130
+ self,
131
+ sequences: Union[List[int], List[List[int]]],
132
+ skip_special_tokens: bool = False,
133
+ clean_up_tokenization_spaces: bool = None,
134
+ **kwargs,
135
+ ):
136
+ return self._tokenizer.batch_decode(
137
+ sequences, skip_special_tokens, clean_up_tokenization_spaces, **kwargs
138
+ )
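A hedged sketch of the left-padded batch that `encode` produces; the `asset/tokenizer` path and `num_vq=4` are assumptions of this example, not requirements stated in the file:

    # Hedged usage sketch for the Tokenizer defined above.
    from ChatTTS.model.tokenizer import Tokenizer

    tok = Tokenizer("asset/tokenizer")                  # assumed asset path
    input_ids, attention_mask, text_mask = tok.encode(
        ["[Stts][spk_emb]hi[Ptts]", "[Stts][spk_emb]a longer sentence[Ptts]"],
        num_vq=4,                                       # assumed codebook count
    )
    print(input_ids.shape)       # (batch, max_len, num_vq)
    print(attention_mask[0])     # leading zeros mark the left padding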
ChatTTS/model/velocity/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .llm import LLM
2
+ from .sampling_params import SamplingParams
ChatTTS/model/velocity/block_manager.py ADDED
@@ -0,0 +1,296 @@
1
+ """A block manager that manages token blocks."""
2
+
3
+ import enum
4
+ from typing import Dict, List, Optional, Set, Tuple
5
+
6
+ from vllm.block import PhysicalTokenBlock
7
+ from .sequence import Sequence, SequenceGroup, SequenceStatus
8
+ from vllm.utils import Device
9
+
10
+ # Mapping: logical block number -> physical block.
11
+ BlockTable = List[PhysicalTokenBlock]
12
+
13
+
14
+ class BlockAllocator:
15
+ """Manages free physical token blocks for a device.
16
+
17
+ The allocator maintains a list of free blocks and allocates a block when
18
+ requested. When a block is freed, its reference count is decremented. If
19
+ the reference count becomes zero, the block is added back to the free list.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ device: Device,
25
+ block_size: int,
26
+ num_blocks: int,
27
+ ) -> None:
28
+ self.device = device
29
+ self.block_size = block_size
30
+ self.num_blocks = num_blocks
31
+
32
+ # Initialize the free blocks.
33
+ self.free_blocks: BlockTable = []
34
+ for i in range(num_blocks):
35
+ block = PhysicalTokenBlock(
36
+ device=device, block_number=i, block_size=block_size
37
+ )
38
+ self.free_blocks.append(block)
39
+
40
+ def allocate(self) -> PhysicalTokenBlock:
41
+ if not self.free_blocks:
42
+ raise ValueError("Out of memory! No free blocks are available.")
43
+ block = self.free_blocks.pop()
44
+ block.ref_count = 1
45
+ return block
46
+
47
+ def free(self, block: PhysicalTokenBlock) -> None:
48
+ if block.ref_count == 0:
49
+ raise ValueError(f"Double free! {block} is already freed.")
50
+ block.ref_count -= 1
51
+ if block.ref_count == 0:
52
+ self.free_blocks.append(block)
53
+
54
+ def get_num_free_blocks(self) -> int:
55
+ return len(self.free_blocks)
56
+
57
+
58
+ class AllocStatus(enum.Enum):
59
+ """Result for BlockSpaceManager.can_allocate
60
+
61
+ 1. Ok: seq_group can be allocated now.
62
+ 2. Later: seq_group cannot be allocated.
63
+ The allocator has enough total capacity, but not enough blocks are free right now.
64
+ 3. Never: seq_group can never be allocated.
65
+ The seq_group is too large to be allocated in GPU memory.
66
+ """
67
+
68
+ OK = enum.auto()
69
+ LATER = enum.auto()
70
+ NEVER = enum.auto()
71
+
72
+
73
+ class BlockSpaceManager:
74
+ """Manages the mapping between logical and physical token blocks."""
75
+
76
+ def __init__(
77
+ self,
78
+ block_size: int,
79
+ num_gpu_blocks: int,
80
+ num_cpu_blocks: int,
81
+ watermark: float = 0.01,
82
+ sliding_window: Optional[int] = None,
83
+ ) -> None:
84
+ self.block_size = block_size
85
+ self.num_total_gpu_blocks = num_gpu_blocks
86
+ self.num_total_cpu_blocks = num_cpu_blocks
87
+
88
+ self.block_sliding_window = None
89
+ if sliding_window is not None:
90
+ assert sliding_window % block_size == 0, (sliding_window, block_size)
91
+ self.block_sliding_window = sliding_window // block_size
92
+
93
+ self.watermark = watermark
94
+ assert watermark >= 0.0
95
+
96
+ self.watermark_blocks = int(watermark * num_gpu_blocks)
97
+ self.gpu_allocator = BlockAllocator(Device.GPU, block_size, num_gpu_blocks)
98
+ self.cpu_allocator = BlockAllocator(Device.CPU, block_size, num_cpu_blocks)
99
+ # Mapping: seq_id -> BlockTable.
100
+ self.block_tables: Dict[int, BlockTable] = {}
101
+
102
+ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
103
+ # FIXME(woosuk): Here we assume that all sequences in the group share
104
+ # the same prompt. This may not be true for preempted sequences.
105
+ seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
106
+ num_required_blocks = len(seq.logical_token_blocks)
107
+ if self.block_sliding_window is not None:
108
+ num_required_blocks = min(num_required_blocks, self.block_sliding_window)
109
+ num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
110
+
111
+ # Use watermark to avoid frequent cache eviction.
112
+ if self.num_total_gpu_blocks - num_required_blocks < self.watermark_blocks:
113
+ return AllocStatus.NEVER
114
+ if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
115
+ return AllocStatus.OK
116
+ else:
117
+ return AllocStatus.LATER
118
+
119
+ def allocate(self, seq_group: SequenceGroup) -> None:
120
+ # NOTE: Here we assume that all sequences in the group have the same
121
+ # prompt.
122
+ seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
123
+
124
+ # Allocate new physical token blocks that will store the prompt tokens.
125
+ block_table: BlockTable = []
126
+ for logical_idx in range(len(seq.logical_token_blocks)):
127
+ if (
128
+ self.block_sliding_window is not None
129
+ and logical_idx >= self.block_sliding_window
130
+ ):
131
+ block = block_table[logical_idx % self.block_sliding_window]
132
+ else:
133
+ block = self.gpu_allocator.allocate()
134
+ # Set the reference counts of the token blocks.
135
+ block.ref_count = seq_group.num_seqs()
136
+ block_table.append(block)
137
+
138
+ # Assign the block table for each sequence.
139
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
140
+ self.block_tables[seq.seq_id] = block_table.copy()
141
+
142
+ def can_append_slot(self, seq_group: SequenceGroup) -> bool:
143
+ # Simple heuristic: If there is at least one free block
144
+ # for each sequence, we can append.
145
+ num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
146
+ num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
147
+ return num_seqs <= num_free_gpu_blocks
148
+
149
+ def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]:
150
+ """Allocate a physical slot for a new token."""
151
+ logical_blocks = seq.logical_token_blocks
152
+ block_table = self.block_tables[seq.seq_id]
153
+
154
+ if len(block_table) < len(logical_blocks):
155
+ if (
156
+ self.block_sliding_window
157
+ and len(block_table) >= self.block_sliding_window
158
+ ):
159
+ # re-use a block
160
+ block_table.append(
161
+ block_table[len(block_table) % self.block_sliding_window]
162
+ )
163
+ else:
164
+ # The sequence has a new logical block.
165
+ # Allocate a new physical block.
166
+ block = self.gpu_allocator.allocate()
167
+ block_table.append(block)
168
+ return None
169
+
170
+ # We want to append the token to the last physical block.
171
+ last_block = block_table[-1]
172
+ assert last_block.device == Device.GPU
173
+ if last_block.ref_count == 1:
174
+ # Not shared with other sequences. Appendable.
175
+ return None
176
+ else:
177
+ # The last block is shared with other sequences.
178
+ # Copy on Write: Allocate a new block and copy the tokens.
179
+ new_block = self.gpu_allocator.allocate()
180
+ block_table[-1] = new_block
181
+ self.gpu_allocator.free(last_block)
182
+ return last_block.block_number, new_block.block_number
183
+
184
+ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
185
+ # NOTE: fork does not allocate a new physical block.
186
+ # Thus, it is always safe from OOM.
187
+ src_block_table = self.block_tables[parent_seq.seq_id]
188
+ self.block_tables[child_seq.seq_id] = src_block_table.copy()
189
+ for block in src_block_table:
190
+ block.ref_count += 1
191
+
192
+ def _get_physical_blocks(
193
+ self, seq_group: SequenceGroup
194
+ ) -> List[PhysicalTokenBlock]:
195
+ # NOTE: Here, we assume that the physical blocks are only shared by
196
+ # the sequences in the same group.
197
+ blocks: Set[PhysicalTokenBlock] = set()
198
+ for seq in seq_group.get_seqs():
199
+ if seq.is_finished():
200
+ continue
201
+ blocks.update(self.block_tables[seq.seq_id])
202
+ return list(blocks)
203
+
204
+ def can_swap_in(self, seq_group: SequenceGroup) -> bool:
205
+ blocks = self._get_physical_blocks(seq_group)
206
+ num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
207
+ num_free_blocks = self.gpu_allocator.get_num_free_blocks()
208
+ # NOTE: Conservatively, we assume that every sequence will allocate
209
+ # at least one free block right after the swap-in.
210
+ # NOTE: This should match the logic in can_append_slot().
211
+ num_required_blocks = len(blocks) + num_swapped_seqs
212
+ return num_free_blocks - num_required_blocks >= self.watermark_blocks
213
+
214
+ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
215
+ # CPU block -> GPU block.
216
+ mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
217
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
218
+ new_block_table: BlockTable = []
219
+ block_table = self.block_tables[seq.seq_id]
220
+
221
+ for cpu_block in block_table:
222
+ if cpu_block in mapping:
223
+ gpu_block = mapping[cpu_block]
224
+ gpu_block.ref_count += 1
225
+ else:
226
+ gpu_block = self.gpu_allocator.allocate()
227
+ mapping[cpu_block] = gpu_block
228
+ new_block_table.append(gpu_block)
229
+ # Free the CPU block swapped in to GPU.
230
+ self.cpu_allocator.free(cpu_block)
231
+ self.block_tables[seq.seq_id] = new_block_table
232
+
233
+ block_number_mapping = {
234
+ cpu_block.block_number: gpu_block.block_number
235
+ for cpu_block, gpu_block in mapping.items()
236
+ }
237
+ return block_number_mapping
238
+
239
+ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
240
+ blocks = self._get_physical_blocks(seq_group)
241
+ return len(blocks) <= self.cpu_allocator.get_num_free_blocks()
242
+
243
+ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
244
+ # GPU block -> CPU block.
245
+ mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
246
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
247
+ new_block_table: BlockTable = []
248
+ block_table = self.block_tables[seq.seq_id]
249
+
250
+ for gpu_block in block_table:
251
+ if gpu_block in mapping:
252
+ cpu_block = mapping[gpu_block]
253
+ cpu_block.ref_count += 1
254
+ else:
255
+ cpu_block = self.cpu_allocator.allocate()
256
+ mapping[gpu_block] = cpu_block
257
+ new_block_table.append(cpu_block)
258
+ # Free the GPU block swapped out to CPU.
259
+ self.gpu_allocator.free(gpu_block)
260
+ self.block_tables[seq.seq_id] = new_block_table
261
+
262
+ block_number_mapping = {
263
+ gpu_block.block_number: cpu_block.block_number
264
+ for gpu_block, cpu_block in mapping.items()
265
+ }
266
+ return block_number_mapping
267
+
268
+ def _free_block_table(self, block_table: BlockTable) -> None:
269
+ for block in set(block_table):
270
+ if block.device == Device.GPU:
271
+ self.gpu_allocator.free(block)
272
+ else:
273
+ self.cpu_allocator.free(block)
274
+
275
+ def free(self, seq: Sequence) -> None:
276
+ if seq.seq_id not in self.block_tables:
277
+ # Already freed or haven't been scheduled yet.
278
+ return
279
+ block_table = self.block_tables[seq.seq_id]
280
+ self._free_block_table(block_table)
281
+ del self.block_tables[seq.seq_id]
282
+
283
+ def reset(self) -> None:
284
+ for block_table in self.block_tables.values():
285
+ self._free_block_table(block_table)
286
+ self.block_tables.clear()
287
+
288
+ def get_block_table(self, seq: Sequence) -> List[int]:
289
+ block_table = self.block_tables[seq.seq_id]
290
+ return [block.block_number for block in block_table]
291
+
292
+ def get_num_free_gpu_blocks(self) -> int:
293
+ return self.gpu_allocator.get_num_free_blocks()
294
+
295
+ def get_num_free_cpu_blocks(self) -> int:
296
+ return self.cpu_allocator.get_num_free_blocks()
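The reference-counting contract of `BlockAllocator` above can be exercised in isolation; a block only returns to the free list once its last user frees it. A minimal sketch (assuming the pinned vllm build exposes `vllm.utils.Device` as imported at the top of this file):

    # Hedged check of BlockAllocator's ref-counting behaviour.
    from vllm.utils import Device
    from ChatTTS.model.velocity.block_manager import BlockAllocator

    alloc = BlockAllocator(Device.CPU, block_size=16, num_blocks=4)
    block = alloc.allocate()               # ref_count == 1, 3 free blocks left
    block.ref_count += 1                   # a second sequence shares the block
    alloc.free(block)                      # ref_count -> 1, still in use
    assert alloc.get_num_free_blocks() == 3
    alloc.free(block)                      # ref_count -> 0, back on the free list
    assert alloc.get_num_free_blocks() == 4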
ChatTTS/model/velocity/configs.py ADDED
@@ -0,0 +1,865 @@
1
+ from typing import Optional, Union, Tuple
2
+ import os
3
+
4
+ import torch
5
+ from transformers import PretrainedConfig
6
+
7
+ from vllm.logger import init_logger
8
+ from vllm.transformers_utils.config import get_config
9
+ from vllm.utils import get_cpu_memory, is_hip
10
+
11
+ import argparse
12
+ import dataclasses
13
+ from dataclasses import dataclass
14
+
15
+
16
+ logger = init_logger(__name__)
17
+
18
+ _GB = 1 << 30
19
+
20
+
21
+ class ModelConfig:
22
+ """Configuration for the model.
23
+
24
+ Args:
25
+ model: Name or path of the huggingface model to use.
26
+ tokenizer: Name or path of the huggingface tokenizer to use.
27
+ tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
28
+ available, and "slow" will always use the slow tokenizer.
29
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
30
+ downloading the model and tokenizer.
31
+ download_dir: Directory to download and load the weights, default to the
32
+ default cache directory of huggingface.
33
+ load_format: The format of the model weights to load:
34
+ "auto" will try to load the weights in the safetensors format and
35
+ fall back to the pytorch bin format if safetensors format is
36
+ not available.
37
+ "pt" will load the weights in the pytorch bin format.
38
+ "safetensors" will load the weights in the safetensors format.
39
+ "npcache" will load the weights in pytorch format and store
40
+ a numpy cache to speed up the loading.
41
+ "dummy" will initialize the weights with random values, which is
42
+ mainly for profiling.
43
+ dtype: Data type for model weights and activations. The "auto" option
44
+ will use FP16 precision for FP32 and FP16 models, and BF16 precision
45
+ for BF16 models.
46
+ seed: Random seed for reproducibility.
47
+ revision: The specific model version to use. It can be a branch name,
48
+ a tag name, or a commit id. If unspecified, will use the default
49
+ version.
50
+ tokenizer_revision: The specific tokenizer version to use. It can be a
51
+ branch name, a tag name, or a commit id. If unspecified, will use
52
+ the default version.
53
+ max_model_len: Maximum length of a sequence (including prompt and
54
+ output). If None, will be derived from the model.
55
+ quantization: Quantization method that was used to quantize the model
56
+ weights. If None, we assume the model weights are not quantized.
57
+ enforce_eager: Whether to enforce eager execution. If True, we will
58
+ disable CUDA graph and always execute the model in eager mode.
59
+ If False, we will use CUDA graph and eager execution in hybrid.
60
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
61
+ When a sequence has context length larger than this, we fall back
62
+ to eager mode.
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ model: str,
68
+ tokenizer: str,
69
+ tokenizer_mode: str,
70
+ trust_remote_code: bool,
71
+ download_dir: Optional[str],
72
+ load_format: str,
73
+ dtype: Union[str, torch.dtype],
74
+ seed: int,
75
+ revision: Optional[str] = None,
76
+ tokenizer_revision: Optional[str] = None,
77
+ max_model_len: Optional[int] = None,
78
+ quantization: Optional[str] = None,
79
+ enforce_eager: bool = False,
80
+ max_context_len_to_capture: Optional[int] = None,
81
+ num_audio_tokens: int = 1024,
82
+ num_text_tokens: int = 80,
83
+ ) -> None:
84
+ self.model = model
85
+ self.tokenizer = tokenizer
86
+ self.tokenizer_mode = tokenizer_mode
87
+ self.trust_remote_code = trust_remote_code
88
+ self.download_dir = download_dir
89
+ self.load_format = load_format
90
+ self.seed = seed
91
+ self.revision = revision
92
+ self.tokenizer_revision = tokenizer_revision
93
+ self.quantization = quantization
94
+ self.enforce_eager = enforce_eager
95
+ self.max_context_len_to_capture = max_context_len_to_capture
96
+ self.num_audio_tokens = num_audio_tokens
97
+ self.num_text_tokens = num_text_tokens
98
+
99
+ if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
100
+ # download model from ModelScope hub,
101
+ # lazy import so that modelscope is not required for normal use.
102
+ from modelscope.hub.snapshot_download import (
103
+ snapshot_download,
104
+ ) # pylint: disable=C
105
+
106
+ model_path = snapshot_download(
107
+ model_id=model, cache_dir=download_dir, revision=revision
108
+ )
109
+ self.model = model_path
110
+ self.download_dir = model_path
111
+ self.tokenizer = model_path
112
+
113
+ self.hf_config = get_config(self.model, trust_remote_code, revision)
114
+ self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
115
+ self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
116
+ self._verify_load_format()
117
+ self._verify_tokenizer_mode()
118
+ self._verify_quantization()
119
+ self._verify_cuda_graph()
120
+
121
+ def _verify_load_format(self) -> None:
122
+ load_format = self.load_format.lower()
123
+ supported_load_format = ["auto", "pt", "safetensors", "npcache", "dummy"]
124
+ rocm_not_supported_load_format = []
125
+ if load_format not in supported_load_format:
126
+ raise ValueError(
127
+ f"Unknown load format: {self.load_format}. Must be one of "
128
+ "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'."
129
+ )
130
+ if is_hip() and load_format in rocm_not_supported_load_format:
131
+ rocm_supported_load_format = [
132
+ f
133
+ for f in supported_load_format
134
+ if (f not in rocm_not_supported_load_format)
135
+ ]
136
+ raise ValueError(
137
+ f"load format '{load_format}' is not supported in ROCm. "
138
+ f"Supported load format are "
139
+ f"{rocm_supported_load_format}"
140
+ )
141
+
142
+ # TODO: Remove this check once HF updates the pt weights of Mixtral.
143
+ architectures = getattr(self.hf_config, "architectures", [])
144
+ if "MixtralForCausalLM" in architectures and load_format == "pt":
145
+ raise ValueError(
146
+ "Currently, the 'pt' format is not supported for Mixtral. "
147
+ "Please use the 'safetensors' format instead. "
148
+ )
149
+ self.load_format = load_format
150
+
151
+ def _verify_tokenizer_mode(self) -> None:
152
+ tokenizer_mode = self.tokenizer_mode.lower()
153
+ if tokenizer_mode not in ["auto", "slow"]:
154
+ raise ValueError(
155
+ f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
156
+ "either 'auto' or 'slow'."
157
+ )
158
+ self.tokenizer_mode = tokenizer_mode
159
+
160
+ def _verify_quantization(self) -> None:
161
+ supported_quantization = ["awq", "gptq", "squeezellm"]
162
+ rocm_not_supported_quantization = ["awq"]
163
+ if self.quantization is not None:
164
+ self.quantization = self.quantization.lower()
165
+
166
+ # Parse quantization method from the HF model config, if available.
167
+ hf_quant_config = getattr(self.hf_config, "quantization_config", None)
168
+ if hf_quant_config is not None:
169
+ hf_quant_method = str(hf_quant_config["quant_method"]).lower()
170
+ if self.quantization is None:
171
+ self.quantization = hf_quant_method
172
+ elif self.quantization != hf_quant_method:
173
+ raise ValueError(
174
+ "Quantization method specified in the model config "
175
+ f"({hf_quant_method}) does not match the quantization "
176
+ f"method specified in the `quantization` argument "
177
+ f"({self.quantization})."
178
+ )
179
+
180
+ if self.quantization is not None:
181
+ if self.quantization not in supported_quantization:
182
+ raise ValueError(
183
+ f"Unknown quantization method: {self.quantization}. Must "
184
+ f"be one of {supported_quantization}."
185
+ )
186
+ if is_hip() and self.quantization in rocm_not_supported_quantization:
187
+ raise ValueError(
188
+ f"{self.quantization} quantization is currently not supported "
189
+ f"in ROCm."
190
+ )
191
+ logger.warning(
192
+ f"{self.quantization} quantization is not fully "
193
+ "optimized yet. The speed can be slower than "
194
+ "non-quantized models."
195
+ )
196
+
197
+ def _verify_cuda_graph(self) -> None:
198
+ if self.max_context_len_to_capture is None:
199
+ self.max_context_len_to_capture = self.max_model_len
200
+ self.max_context_len_to_capture = min(
201
+ self.max_context_len_to_capture, self.max_model_len
202
+ )
203
+
204
+ def verify_with_parallel_config(
205
+ self,
206
+ parallel_config: "ParallelConfig",
207
+ ) -> None:
208
+ total_num_attention_heads = self.hf_config.num_attention_heads
209
+ tensor_parallel_size = parallel_config.tensor_parallel_size
210
+ if total_num_attention_heads % tensor_parallel_size != 0:
211
+ raise ValueError(
212
+ f"Total number of attention heads ({total_num_attention_heads})"
213
+ " must be divisible by tensor parallel size "
214
+ f"({tensor_parallel_size})."
215
+ )
216
+
217
+ total_num_hidden_layers = self.hf_config.num_hidden_layers
218
+ pipeline_parallel_size = parallel_config.pipeline_parallel_size
219
+ if total_num_hidden_layers % pipeline_parallel_size != 0:
220
+ raise ValueError(
221
+ f"Total number of hidden layers ({total_num_hidden_layers}) "
222
+ "must be divisible by pipeline parallel size "
223
+ f"({pipeline_parallel_size})."
224
+ )
225
+
226
+ def get_sliding_window(self) -> Optional[int]:
227
+ return getattr(self.hf_config, "sliding_window", None)
228
+
229
+ def get_vocab_size(self) -> int:
230
+ return self.hf_config.vocab_size
231
+
232
+ def get_hidden_size(self) -> int:
233
+ return self.hf_config.hidden_size
234
+
235
+ def get_head_size(self) -> int:
236
+ # FIXME(woosuk): This may not be true for all models.
237
+ return self.hf_config.hidden_size // self.hf_config.num_attention_heads
238
+
239
+ def get_total_num_kv_heads(self) -> int:
240
+ """Returns the total number of KV heads."""
241
+ # For GPTBigCode & Falcon:
242
+ # NOTE: for falcon, when new_decoder_architecture is True, the
243
+ # multi_query flag is ignored and we use n_head_kv for the number of
244
+ # KV heads.
245
+ falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
246
+ new_decoder_arch_falcon = (
247
+ self.hf_config.model_type in falcon_model_types
248
+ and getattr(self.hf_config, "new_decoder_architecture", False)
249
+ )
250
+ if not new_decoder_arch_falcon and getattr(
251
+ self.hf_config, "multi_query", False
252
+ ):
253
+ # Multi-query attention, only one KV head.
254
+ # Currently, tensor parallelism is not supported in this case.
255
+ return 1
256
+
257
+ attributes = [
258
+ # For Falcon:
259
+ "n_head_kv",
260
+ "num_kv_heads",
261
+ # For LLaMA-2:
262
+ "num_key_value_heads",
263
+ # For ChatGLM:
264
+ "multi_query_group_num",
265
+ ]
266
+ for attr in attributes:
267
+ num_kv_heads = getattr(self.hf_config, attr, None)
268
+ if num_kv_heads is not None:
269
+ return num_kv_heads
270
+
271
+ # For non-grouped-query attention models, the number of KV heads is
272
+ # equal to the number of attention heads.
273
+ return self.hf_config.num_attention_heads
274
+
275
+ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
276
+ """Returns the number of KV heads per GPU."""
277
+ total_num_kv_heads = self.get_total_num_kv_heads()
278
+ # If tensor parallelism is used, we divide the number of KV heads by
279
+ # the tensor parallel size. We will replicate the KV heads in the
280
+ # case where the number of KV heads is smaller than the tensor
281
+ # parallel size so each GPU has at least one KV head.
282
+ return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)
283
+
284
+ def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
285
+ total_num_hidden_layers = self.hf_config.num_hidden_layers
286
+ return total_num_hidden_layers // parallel_config.pipeline_parallel_size
287
+
288
+
289
+ class CacheConfig:
290
+ """Configuration for the KV cache.
291
+
292
+ Args:
293
+ block_size: Size of a cache block in number of tokens.
294
+ gpu_memory_utilization: Fraction of GPU memory to use for the
295
+ vLLM execution.
296
+ swap_space: Size of the CPU swap space per GPU (in GiB).
297
+ """
298
+
299
+ def __init__(
300
+ self,
301
+ block_size: int,
302
+ gpu_memory_utilization: float,
303
+ swap_space: int,
304
+ sliding_window: Optional[int] = None,
305
+ ) -> None:
306
+ self.block_size = block_size
307
+ self.gpu_memory_utilization = gpu_memory_utilization
308
+ self.swap_space_bytes = swap_space * _GB
309
+ self.sliding_window = sliding_window
310
+ self._verify_args()
311
+
312
+ # Will be set after profiling.
313
+ self.num_gpu_blocks = None
314
+ self.num_cpu_blocks = None
315
+
316
+ def _verify_args(self) -> None:
317
+ if self.gpu_memory_utilization > 1.0:
318
+ raise ValueError(
319
+ "GPU memory utilization must be less than 1.0. Got "
320
+ f"{self.gpu_memory_utilization}."
321
+ )
322
+
323
+ def verify_with_parallel_config(
324
+ self,
325
+ parallel_config: "ParallelConfig",
326
+ ) -> None:
327
+ total_cpu_memory = get_cpu_memory()
328
+ # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
329
+ # group are in the same node. However, the GPUs may span multiple nodes.
330
+ num_gpus_per_node = parallel_config.tensor_parallel_size
331
+ cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node
332
+
333
+ msg = (
334
+ f"{cpu_memory_usage / _GB:.2f} GiB out of "
335
+ f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
336
+ "allocated for the swap space."
337
+ )
338
+ if cpu_memory_usage > 0.7 * total_cpu_memory:
339
+ raise ValueError("Too large swap space. " + msg)
340
+ elif cpu_memory_usage > 0.4 * total_cpu_memory:
341
+ logger.warning("Possibly too large swap space. " + msg)
342
+
343
+
344
+ class ParallelConfig:
345
+ """Configuration for the distributed execution.
346
+
347
+ Args:
348
+ pipeline_parallel_size: Number of pipeline parallel groups.
349
+ tensor_parallel_size: Number of tensor parallel groups.
350
+ worker_use_ray: Whether to use Ray for model workers. Will be set to
351
+ True if either pipeline_parallel_size or tensor_parallel_size is
352
+ greater than 1.
353
+ """
354
+
355
+ def __init__(
356
+ self,
357
+ pipeline_parallel_size: int,
358
+ tensor_parallel_size: int,
359
+ worker_use_ray: bool,
360
+ max_parallel_loading_workers: Optional[int] = None,
361
+ ) -> None:
362
+ self.pipeline_parallel_size = pipeline_parallel_size
363
+ self.tensor_parallel_size = tensor_parallel_size
364
+ self.worker_use_ray = worker_use_ray
365
+ self.max_parallel_loading_workers = max_parallel_loading_workers
366
+
367
+ self.world_size = pipeline_parallel_size * tensor_parallel_size
368
+ if self.world_size > 1:
369
+ self.worker_use_ray = True
370
+ self._verify_args()
371
+
372
+ def _verify_args(self) -> None:
373
+ if self.pipeline_parallel_size > 1:
374
+ raise NotImplementedError("Pipeline parallelism is not supported yet.")
375
+
376
+
377
+ class SchedulerConfig:
378
+ """Scheduler configuration.
379
+
380
+ Args:
381
+ max_num_batched_tokens: Maximum number of tokens to be processed in
382
+ a single iteration.
383
+ max_num_seqs: Maximum number of sequences to be processed in a single
384
+ iteration.
385
+ max_model_len: Maximum length of a sequence (including prompt
386
+ and generated text).
387
+ max_paddings: Maximum number of paddings to be added to a batch.
388
+ """
389
+
390
+ def __init__(
391
+ self,
392
+ max_num_batched_tokens: Optional[int],
393
+ max_num_seqs: int,
394
+ max_model_len: int,
395
+ max_paddings: int,
396
+ ) -> None:
397
+ if max_num_batched_tokens is not None:
398
+ self.max_num_batched_tokens = max_num_batched_tokens
399
+ else:
400
+ # If max_model_len is too short, use 2048 as the default value for
401
+ # higher throughput.
402
+ self.max_num_batched_tokens = max(max_model_len, 2048)
403
+ self.max_num_seqs = max_num_seqs
404
+ self.max_model_len = max_model_len
405
+ self.max_paddings = max_paddings
406
+ self._verify_args()
407
+
408
+ def _verify_args(self) -> None:
409
+ if self.max_num_batched_tokens < self.max_model_len:
410
+ raise ValueError(
411
+ f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
412
+ f"smaller than max_model_len ({self.max_model_len}). "
413
+ "This effectively limits the maximum sequence length to "
414
+ "max_num_batched_tokens and makes vLLM reject longer "
415
+ "sequences. Please increase max_num_batched_tokens or "
416
+ "decrease max_model_len."
417
+ )
418
+ if self.max_num_batched_tokens < self.max_num_seqs:
419
+ raise ValueError(
420
+ f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
421
+ "be greater than or equal to max_num_seqs "
422
+ f"({self.max_num_seqs})."
423
+ )
424
+
425
+
426
+ _STR_DTYPE_TO_TORCH_DTYPE = {
427
+ "half": torch.float16,
428
+ "float16": torch.float16,
429
+ "float": torch.float32,
430
+ "float32": torch.float32,
431
+ "bfloat16": torch.bfloat16,
432
+ }
433
+
434
+ _ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]
435
+
436
+
437
+ def _get_and_verify_dtype(
438
+ config: PretrainedConfig,
439
+ dtype: Union[str, torch.dtype],
440
+ ) -> torch.dtype:
441
+ # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
442
+ # because config.torch_dtype can be None.
443
+ config_dtype = getattr(config, "torch_dtype", None)
444
+ if config_dtype is None:
445
+ config_dtype = torch.float32
446
+
447
+ if isinstance(dtype, str):
448
+ dtype = dtype.lower()
449
+ if dtype == "auto":
450
+ if config_dtype == torch.float32:
451
+ # Following the common practice, we use float16 for float32
452
+ # models.
453
+ torch_dtype = torch.float16
454
+ else:
455
+ torch_dtype = config_dtype
456
+ else:
457
+ if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
458
+ raise ValueError(f"Unknown dtype: {dtype}")
459
+ torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
460
+ elif isinstance(dtype, torch.dtype):
461
+ torch_dtype = dtype
462
+ else:
463
+ raise ValueError(f"Unknown dtype: {dtype}")
464
+
465
+ if is_hip() and torch_dtype == torch.float32:
466
+ rocm_supported_dtypes = [
467
+ k
468
+ for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items()
469
+ if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
470
+ ]
471
+ raise ValueError(
472
+ f"dtype '{dtype}' is not supported in ROCm. "
473
+ f"Supported dtypes are {rocm_supported_dtypes}"
474
+ )
475
+
476
+ # Verify the dtype.
477
+ if torch_dtype != config_dtype:
478
+ if torch_dtype == torch.float32:
479
+ # Upcasting to float32 is allowed.
480
+ pass
481
+ elif config_dtype == torch.float32:
482
+ # Downcasting from float32 to float16 or bfloat16 is allowed.
483
+ pass
484
+ else:
485
+ # Casting between float16 and bfloat16 is allowed with a warning.
486
+ logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
487
+
488
+ return torch_dtype
489
+
490
+
491
+ def _get_and_verify_max_len(
492
+ hf_config: PretrainedConfig,
493
+ max_model_len: Optional[int],
494
+ ) -> int:
495
+ """Get and verify the model's maximum length."""
496
+ derived_max_model_len = float("inf")
497
+ possible_keys = [
498
+ # OPT
499
+ "max_position_embeddings",
500
+ # GPT-2
501
+ "n_positions",
502
+ # MPT
503
+ "max_seq_len",
504
+ # ChatGLM2
505
+ "seq_length",
506
+ # Others
507
+ "max_sequence_length",
508
+ "max_seq_length",
509
+ "seq_len",
510
+ ]
511
+ for key in possible_keys:
512
+ max_len_key = getattr(hf_config, key, None)
513
+ if max_len_key is not None:
514
+ derived_max_model_len = min(derived_max_model_len, max_len_key)
515
+ if derived_max_model_len == float("inf"):
516
+ if max_model_len is not None:
517
+ # If max_model_len is specified, we use it.
518
+ return max_model_len
519
+
520
+ default_max_len = 2048
521
+ logger.warning(
522
+ "The model's config.json does not contain any of the following "
523
+ "keys to determine the original maximum length of the model: "
524
+ f"{possible_keys}. Assuming the model's maximum length is "
525
+ f"{default_max_len}."
526
+ )
527
+ derived_max_model_len = default_max_len
528
+
529
+ rope_scaling = getattr(hf_config, "rope_scaling", None)
530
+ if rope_scaling is not None:
531
+ assert "factor" in rope_scaling
532
+ scaling_factor = rope_scaling["factor"]
533
+ if rope_scaling["type"] == "yarn":
534
+ derived_max_model_len = rope_scaling["original_max_position_embeddings"]
535
+ derived_max_model_len *= scaling_factor
536
+
537
+ if max_model_len is None:
538
+ max_model_len = derived_max_model_len
539
+ elif max_model_len > derived_max_model_len:
540
+ raise ValueError(
541
+ f"User-specified max_model_len ({max_model_len}) is greater than "
542
+ f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
543
+ " in model's config.json). This may lead to incorrect model "
544
+ "outputs or CUDA errors. Make sure the value is correct and "
545
+ "within the model context size."
546
+ )
547
+ return int(max_model_len)
548
+
549
+
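In short, the dtype resolution above works as follows: "auto" keeps the checkpoint's torch_dtype, except that float32 checkpoints are served in float16; explicit strings are looked up in _STR_DTYPE_TO_TORCH_DTYPE; and only casts to or from float32 pass silently, while a float16/bfloat16 cross-cast logs a warning. Below is a standalone sketch of the first two rules, assuming the mapping table mirrors the one defined earlier in this file (it is restated here rather than imported, since importing configs.py pulls in vllm):

import torch

# Assumed mirror of the string-to-dtype table used by _get_and_verify_dtype.
STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

def resolve_dtype(config_dtype: torch.dtype, requested: str) -> torch.dtype:
    # "auto" keeps the checkpoint dtype, except float32 checkpoints are
    # served in float16 (matching the branch in the function above).
    if requested == "auto":
        return torch.float16 if config_dtype == torch.float32 else config_dtype
    if requested not in STR_DTYPE_TO_TORCH_DTYPE:
        raise ValueError(f"Unknown dtype: {requested}")
    return STR_DTYPE_TO_TORCH_DTYPE[requested]

assert resolve_dtype(torch.float32, "auto") == torch.float16
assert resolve_dtype(torch.bfloat16, "auto") == torch.bfloat16
assert resolve_dtype(torch.float32, "bfloat16") == torch.bfloat16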
550
+ @dataclass
551
+ class EngineArgs:
552
+ """Arguments for vLLM engine."""
553
+
554
+ model: str
555
+ tokenizer: Optional[str] = None
556
+ tokenizer_mode: str = "auto"
557
+ trust_remote_code: bool = False
558
+ download_dir: Optional[str] = None
559
+ load_format: str = "auto"
560
+ dtype: str = "auto"
561
+ seed: int = 0
562
+ max_model_len: Optional[int] = None
563
+ worker_use_ray: bool = False
564
+ pipeline_parallel_size: int = 1
565
+ tensor_parallel_size: int = 1
566
+ max_parallel_loading_workers: Optional[int] = None
567
+ block_size: int = 16
568
+ swap_space: int = 4 # GiB
569
+ gpu_memory_utilization: float = 0.90
570
+ max_num_batched_tokens: Optional[int] = None
571
+ max_num_seqs: int = 256
572
+ max_paddings: int = 256
573
+ disable_log_stats: bool = False
574
+ revision: Optional[str] = None
575
+ tokenizer_revision: Optional[str] = None
576
+ quantization: Optional[str] = None
577
+ enforce_eager: bool = False
578
+ max_context_len_to_capture: int = 8192
579
+ num_audio_tokens: int = 1024
580
+ num_text_tokens: int = 80
581
+
582
+ def __post_init__(self):
583
+ if self.tokenizer is None:
584
+ self.tokenizer = self.model
585
+
586
+ @staticmethod
587
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
588
+ """Shared CLI arguments for vLLM engine."""
589
+
590
+ # NOTE: If you update any of the arguments below, please also
591
+ # make sure to update docs/source/models/engine_args.rst
592
+
593
+ # Model arguments
594
+ parser.add_argument(
595
+ "--model",
596
+ type=str,
597
+ default="facebook/opt-125m",
598
+ help="name or path of the huggingface model to use",
599
+ )
600
+ parser.add_argument(
601
+ "--tokenizer",
602
+ type=str,
603
+ default=EngineArgs.tokenizer,
604
+ help="name or path of the huggingface tokenizer to use",
605
+ )
606
+ parser.add_argument(
607
+ "--revision",
608
+ type=str,
609
+ default=None,
610
+ help="the specific model version to use. It can be a branch "
611
+ "name, a tag name, or a commit id. If unspecified, will use "
612
+ "the default version.",
613
+ )
614
+ parser.add_argument(
615
+ "--tokenizer-revision",
616
+ type=str,
617
+ default=None,
618
+ help="the specific tokenizer version to use. It can be a branch "
619
+ "name, a tag name, or a commit id. If unspecified, will use "
620
+ "the default version.",
621
+ )
622
+ parser.add_argument(
623
+ "--tokenizer-mode",
624
+ type=str,
625
+ default=EngineArgs.tokenizer_mode,
626
+ choices=["auto", "slow"],
627
+ help='tokenizer mode. "auto" will use the fast '
628
+ 'tokenizer if available, and "slow" will '
629
+ "always use the slow tokenizer.",
630
+ )
631
+ parser.add_argument(
632
+ "--trust-remote-code",
633
+ action="store_true",
634
+ help="trust remote code from huggingface",
635
+ )
636
+ parser.add_argument(
637
+ "--download-dir",
638
+ type=str,
639
+ default=EngineArgs.download_dir,
640
+ help="directory to download and load the weights, "
641
+ "default to the default cache dir of "
642
+ "huggingface",
643
+ )
644
+ parser.add_argument(
645
+ "--load-format",
646
+ type=str,
647
+ default=EngineArgs.load_format,
648
+ choices=["auto", "pt", "safetensors", "npcache", "dummy"],
649
+ help="The format of the model weights to load. "
650
+ '"auto" will try to load the weights in the safetensors format '
651
+ "and fall back to the pytorch bin format if safetensors format "
652
+ "is not available. "
653
+ '"pt" will load the weights in the pytorch bin format. '
654
+ '"safetensors" will load the weights in the safetensors format. '
655
+ '"npcache" will load the weights in pytorch format and store '
656
+ "a numpy cache to speed up the loading. "
657
+ '"dummy" will initialize the weights with random values, '
658
+ "which is mainly for profiling.",
659
+ )
660
+ parser.add_argument(
661
+ "--dtype",
662
+ type=str,
663
+ default=EngineArgs.dtype,
664
+ choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
665
+ help="data type for model weights and activations. "
666
+ 'The "auto" option will use FP16 precision '
667
+ "for FP32 and FP16 models, and BF16 precision "
668
+ "for BF16 models.",
669
+ )
670
+ parser.add_argument(
671
+ "--max-model-len",
672
+ type=int,
673
+ default=None,
674
+ help="model context length. If unspecified, "
675
+ "will be automatically derived from the model.",
676
+ )
677
+ # Parallel arguments
678
+ parser.add_argument(
679
+ "--worker-use-ray",
680
+ action="store_true",
681
+ help="use Ray for distributed serving, will be "
682
+ "automatically set when using more than 1 GPU",
683
+ )
684
+ parser.add_argument(
685
+ "--pipeline-parallel-size",
686
+ "-pp",
687
+ type=int,
688
+ default=EngineArgs.pipeline_parallel_size,
689
+ help="number of pipeline stages",
690
+ )
691
+ parser.add_argument(
692
+ "--tensor-parallel-size",
693
+ "-tp",
694
+ type=int,
695
+ default=EngineArgs.tensor_parallel_size,
696
+ help="number of tensor parallel replicas",
697
+ )
698
+ parser.add_argument(
699
+ "--max-parallel-loading-workers",
700
+ type=int,
701
+ help="load model sequentially in multiple batches, "
702
+ "to avoid RAM OOM when using tensor "
703
+ "parallel and large models",
704
+ )
705
+ # KV cache arguments
706
+ parser.add_argument(
707
+ "--block-size",
708
+ type=int,
709
+ default=EngineArgs.block_size,
710
+ choices=[8, 16, 32],
711
+ help="token block size",
712
+ )
713
+ # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
714
+ parser.add_argument(
715
+ "--seed", type=int, default=EngineArgs.seed, help="random seed"
716
+ )
717
+ parser.add_argument(
718
+ "--swap-space",
719
+ type=int,
720
+ default=EngineArgs.swap_space,
721
+ help="CPU swap space size (GiB) per GPU",
722
+ )
723
+ parser.add_argument(
724
+ "--gpu-memory-utilization",
725
+ type=float,
726
+ default=EngineArgs.gpu_memory_utilization,
727
+ help="the fraction of GPU memory to be used for "
728
+ "the model executor, which can range from 0 to 1."
729
+ "If unspecified, will use the default value of 0.9.",
730
+ )
731
+ parser.add_argument(
732
+ "--max-num-batched-tokens",
733
+ type=int,
734
+ default=EngineArgs.max_num_batched_tokens,
735
+ help="maximum number of batched tokens per " "iteration",
736
+ )
737
+ parser.add_argument(
738
+ "--max-num-seqs",
739
+ type=int,
740
+ default=EngineArgs.max_num_seqs,
741
+ help="maximum number of sequences per iteration",
742
+ )
743
+ parser.add_argument(
744
+ "--max-paddings",
745
+ type=int,
746
+ default=EngineArgs.max_paddings,
747
+ help="maximum number of paddings in a batch",
748
+ )
749
+ parser.add_argument(
750
+ "--disable-log-stats",
751
+ action="store_true",
752
+ help="disable logging statistics",
753
+ )
754
+ # Quantization settings.
755
+ parser.add_argument(
756
+ "--quantization",
757
+ "-q",
758
+ type=str,
759
+ choices=["awq", "gptq", "squeezellm", None],
760
+ default=None,
761
+ help="Method used to quantize the weights. If "
762
+ "None, we first check the `quantization_config` "
763
+ "attribute in the model config file. If that is "
764
+ "None, we assume the model weights are not "
765
+ "quantized and use `dtype` to determine the data "
766
+ "type of the weights.",
767
+ )
768
+ parser.add_argument(
769
+ "--enforce-eager",
770
+ action="store_true",
771
+ help="Always use eager-mode PyTorch. If False, "
772
+ "will use eager mode and CUDA graph in hybrid "
773
+ "for maximal performance and flexibility.",
774
+ )
775
+ parser.add_argument(
776
+ "--max-context-len-to-capture",
777
+ type=int,
778
+ default=EngineArgs.max_context_len_to_capture,
779
+ help="maximum context length covered by CUDA "
780
+ "graphs. When a sequence has context length "
781
+ "larger than this, we fall back to eager mode.",
782
+ )
783
+ return parser
784
+
785
+ @classmethod
786
+ def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
787
+ # Get the list of attributes of this dataclass.
788
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
789
+ # Set the attributes from the parsed arguments.
790
+ engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
791
+ return engine_args
792
+
793
+ def create_engine_configs(
794
+ self,
795
+ ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
796
+ model_config = ModelConfig(
797
+ self.model,
798
+ self.tokenizer,
799
+ self.tokenizer_mode,
800
+ self.trust_remote_code,
801
+ self.download_dir,
802
+ self.load_format,
803
+ self.dtype,
804
+ self.seed,
805
+ self.revision,
806
+ self.tokenizer_revision,
807
+ self.max_model_len,
808
+ self.quantization,
809
+ self.enforce_eager,
810
+ self.max_context_len_to_capture,
811
+ self.num_audio_tokens,
812
+ self.num_text_tokens,
813
+ )
814
+ cache_config = CacheConfig(
815
+ self.block_size,
816
+ self.gpu_memory_utilization,
817
+ self.swap_space,
818
+ model_config.get_sliding_window(),
819
+ )
820
+ parallel_config = ParallelConfig(
821
+ self.pipeline_parallel_size,
822
+ self.tensor_parallel_size,
823
+ self.worker_use_ray,
824
+ self.max_parallel_loading_workers,
825
+ )
826
+ scheduler_config = SchedulerConfig(
827
+ self.max_num_batched_tokens,
828
+ self.max_num_seqs,
829
+ model_config.max_model_len,
830
+ self.max_paddings,
831
+ )
832
+ return model_config, cache_config, parallel_config, scheduler_config
833
+
834
+
835
+ @dataclass
836
+ class AsyncEngineArgs(EngineArgs):
837
+ """Arguments for asynchronous vLLM engine."""
838
+
839
+ engine_use_ray: bool = False
840
+ disable_log_requests: bool = False
841
+ max_log_len: Optional[int] = None
842
+
843
+ @staticmethod
844
+ def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
845
+ parser = EngineArgs.add_cli_args(parser)
846
+ parser.add_argument(
847
+ "--engine-use-ray",
848
+ action="store_true",
849
+ help="use Ray to start the LLM engine in a "
850
+ "separate process as the server process.",
851
+ )
852
+ parser.add_argument(
853
+ "--disable-log-requests",
854
+ action="store_true",
855
+ help="disable logging requests",
856
+ )
857
+ parser.add_argument(
858
+ "--max-log-len",
859
+ type=int,
860
+ default=None,
861
+ help="max number of prompt characters or prompt "
862
+ "ID numbers being printed in log. "
863
+ "Default: unlimited.",
864
+ )
865
+ return parser
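For orientation: add_cli_args exposes one flag per field above except the two ChatTTS-specific fields (num_audio_tokens, num_text_tokens), which have no flag, so from_cli_args on a parser built only from this method appears unable to fill them from the namespace; they have to be supplied when constructing EngineArgs directly. A minimal sketch of driving the parser, assuming vllm is installed (the module imports it) and using placeholder paths:

import argparse

# Import path follows the package layout added in this commit.
from ChatTTS.model.velocity.configs import EngineArgs

parser = argparse.ArgumentParser(description="velocity engine options")
parser = EngineArgs.add_cli_args(parser)

# Parse a synthetic command line; "/path/to/model" is a placeholder.
args = parser.parse_args(
    ["--model", "/path/to/model", "--dtype", "bfloat16", "--max-num-seqs", "64"]
)
print(args.model, args.dtype, args.max_num_seqs)

# Constructing EngineArgs directly supplies the two ChatTTS-specific token
# counts (values here are the dataclass defaults shown above).
engine_args = EngineArgs(
    model="/path/to/model",
    dtype="bfloat16",
    num_audio_tokens=1024,
    num_text_tokens=80,
)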
ChatTTS/model/velocity/llama.py ADDED
@@ -0,0 +1,393 @@
1
+ # coding=utf-8
2
+ # Adapted from
3
+ # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
4
+ # Copyright 2023 The vLLM team.
5
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
6
+ #
7
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
8
+ # and OPT implementations in this library. It has been modified from its
9
+ # original forms to accommodate minor architectural differences compared
10
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+ """Inference-only LLaMA model compatible with HuggingFace weights."""
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+ import torch
27
+ from torch import nn
28
+ from transformers import LlamaConfig
29
+
30
+ from vllm.model_executor.input_metadata import InputMetadata
31
+ from vllm.model_executor.layers.activation import SiluAndMul
32
+ from vllm.model_executor.layers.attention import PagedAttention
33
+ from vllm.model_executor.layers.layernorm import RMSNorm
34
+ from vllm.model_executor.layers.linear import (
35
+ LinearMethodBase,
36
+ MergedColumnParallelLinear,
37
+ QKVParallelLinear,
38
+ RowParallelLinear,
39
+ )
40
+ from vllm.model_executor.layers.rotary_embedding import get_rope
41
+ from vllm.model_executor.layers.sampler import Sampler
42
+ from vllm.model_executor.layers.vocab_parallel_embedding import (
43
+ VocabParallelEmbedding,
44
+ ParallelLMHead,
45
+ )
46
+ from vllm.model_executor.parallel_utils.parallel_state import (
47
+ get_tensor_model_parallel_world_size,
48
+ )
49
+ from vllm.model_executor.sampling_metadata import SamplingMetadata
50
+ from vllm.model_executor.weight_utils import (
51
+ default_weight_loader,
52
+ hf_model_weights_iterator,
53
+ )
54
+ from vllm.sequence import SamplerOutput
55
+
56
+ KVCache = Tuple[torch.Tensor, torch.Tensor]
57
+
58
+
59
+ class LlamaMLP(nn.Module):
60
+
61
+ def __init__(
62
+ self,
63
+ hidden_size: int,
64
+ intermediate_size: int,
65
+ hidden_act: str,
66
+ linear_method: Optional[LinearMethodBase] = None,
67
+ ) -> None:
68
+ super().__init__()
69
+ self.gate_up_proj = MergedColumnParallelLinear(
70
+ hidden_size,
71
+ [intermediate_size] * 2,
72
+ bias=False,
73
+ linear_method=linear_method,
74
+ )
75
+ self.down_proj = RowParallelLinear(
76
+ intermediate_size, hidden_size, bias=False, linear_method=linear_method
77
+ )
78
+ if hidden_act != "silu":
79
+ raise ValueError(
80
+ f"Unsupported activation: {hidden_act}. "
81
+ "Only silu is supported for now."
82
+ )
83
+ self.act_fn = SiluAndMul()
84
+
85
+ def forward(self, x):
86
+ gate_up, _ = self.gate_up_proj(x)
87
+ x = self.act_fn(gate_up)
88
+ x, _ = self.down_proj(x)
89
+ return x
90
+
91
+
92
+ class LlamaAttention(nn.Module):
93
+
94
+ def __init__(
95
+ self,
96
+ hidden_size: int,
97
+ num_heads: int,
98
+ num_kv_heads: int,
99
+ rope_theta: float = 10000,
100
+ rope_scaling: Optional[Dict[str, Any]] = None,
101
+ max_position_embeddings: int = 8192,
102
+ linear_method: Optional[LinearMethodBase] = None,
103
+ ) -> None:
104
+ super().__init__()
105
+ self.hidden_size = hidden_size
106
+ tp_size = get_tensor_model_parallel_world_size()
107
+ self.total_num_heads = num_heads
108
+ assert self.total_num_heads % tp_size == 0
109
+ self.num_heads = self.total_num_heads // tp_size
110
+ self.total_num_kv_heads = num_kv_heads
111
+ if self.total_num_kv_heads >= tp_size:
112
+ # Number of KV heads is greater than TP size, so we partition
113
+ # the KV heads across multiple tensor parallel GPUs.
114
+ assert self.total_num_kv_heads % tp_size == 0
115
+ else:
116
+ # Number of KV heads is less than TP size, so we replicate
117
+ # the KV heads across multiple tensor parallel GPUs.
118
+ assert tp_size % self.total_num_kv_heads == 0
119
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
120
+ self.head_dim = hidden_size // self.total_num_heads
121
+ self.q_size = self.num_heads * self.head_dim
122
+ self.kv_size = self.num_kv_heads * self.head_dim
123
+ self.scaling = self.head_dim**-0.5
124
+ self.rope_theta = rope_theta
125
+ self.max_position_embeddings = max_position_embeddings
126
+
127
+ self.qkv_proj = QKVParallelLinear(
128
+ hidden_size,
129
+ self.head_dim,
130
+ self.total_num_heads,
131
+ self.total_num_kv_heads,
132
+ bias=False,
133
+ linear_method=linear_method,
134
+ )
135
+ self.o_proj = RowParallelLinear(
136
+ self.total_num_heads * self.head_dim,
137
+ hidden_size,
138
+ bias=False,
139
+ linear_method=linear_method,
140
+ )
141
+
142
+ self.rotary_emb = get_rope(
143
+ self.head_dim,
144
+ rotary_dim=self.head_dim,
145
+ max_position=max_position_embeddings,
146
+ base=rope_theta,
147
+ rope_scaling=rope_scaling,
148
+ )
149
+ self.attn = PagedAttention(
150
+ self.num_heads, self.head_dim, self.scaling, num_kv_heads=self.num_kv_heads
151
+ )
152
+
153
+ def forward(
154
+ self,
155
+ positions: torch.Tensor,
156
+ hidden_states: torch.Tensor,
157
+ kv_cache: KVCache,
158
+ input_metadata: InputMetadata,
159
+ ) -> torch.Tensor:
160
+ qkv, _ = self.qkv_proj(hidden_states)
161
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
162
+ q, k = self.rotary_emb(positions, q, k)
163
+ k_cache, v_cache = kv_cache
164
+ attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata)
165
+ output, _ = self.o_proj(attn_output)
166
+ return output
167
+
168
+
169
+ class LlamaDecoderLayer(nn.Module):
170
+
171
+ def __init__(
172
+ self,
173
+ config: LlamaConfig,
174
+ linear_method: Optional[LinearMethodBase] = None,
175
+ ) -> None:
176
+ super().__init__()
177
+ self.hidden_size = config.hidden_size
178
+ rope_theta = getattr(config, "rope_theta", 10000)
179
+ rope_scaling = getattr(config, "rope_scaling", None)
180
+ max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
181
+ self.self_attn = LlamaAttention(
182
+ hidden_size=self.hidden_size,
183
+ num_heads=config.num_attention_heads,
184
+ num_kv_heads=config.num_key_value_heads,
185
+ rope_theta=rope_theta,
186
+ rope_scaling=rope_scaling,
187
+ max_position_embeddings=max_position_embeddings,
188
+ linear_method=linear_method,
189
+ )
190
+ self.mlp = LlamaMLP(
191
+ hidden_size=self.hidden_size,
192
+ intermediate_size=config.intermediate_size,
193
+ hidden_act=config.hidden_act,
194
+ linear_method=linear_method,
195
+ )
196
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
197
+ self.post_attention_layernorm = RMSNorm(
198
+ config.hidden_size, eps=config.rms_norm_eps
199
+ )
200
+
201
+ def forward(
202
+ self,
203
+ positions: torch.Tensor,
204
+ hidden_states: torch.Tensor,
205
+ kv_cache: KVCache,
206
+ input_metadata: InputMetadata,
207
+ residual: Optional[torch.Tensor],
208
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
209
+ # Self Attention
210
+ if residual is None:
211
+ residual = hidden_states
212
+ hidden_states = self.input_layernorm(hidden_states)
213
+ else:
214
+ hidden_states, residual = self.input_layernorm(hidden_states, residual)
215
+ hidden_states = self.self_attn(
216
+ positions=positions,
217
+ hidden_states=hidden_states,
218
+ kv_cache=kv_cache,
219
+ input_metadata=input_metadata,
220
+ )
221
+
222
+ # Fully Connected
223
+ hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
224
+ hidden_states = self.mlp(hidden_states)
225
+ return hidden_states, residual
226
+
227
+
228
+ class LlamaModel(nn.Module):
229
+
230
+ def __init__(
231
+ self,
232
+ config: LlamaConfig,
233
+ linear_method: Optional[LinearMethodBase] = None,
234
+ ) -> None:
235
+ super().__init__()
236
+ self.config = config
237
+ self.padding_idx = config.pad_token_id
238
+ self.vocab_size = config.vocab_size
239
+ self.embed_tokens = VocabParallelEmbedding(
240
+ config.vocab_size,
241
+ config.hidden_size,
242
+ )
243
+ self.layers = nn.ModuleList(
244
+ [
245
+ LlamaDecoderLayer(config, linear_method)
246
+ for _ in range(config.num_hidden_layers)
247
+ ]
248
+ )
249
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
250
+
251
+ def forward(
252
+ self,
253
+ input_emb: torch.Tensor,
254
+ positions: torch.Tensor,
255
+ kv_caches: List[KVCache],
256
+ input_metadata: InputMetadata,
257
+ ) -> torch.Tensor:
258
+ hidden_states = input_emb
259
+ residual = None
260
+ for i in range(len(self.layers)):
261
+ layer = self.layers[i]
262
+ hidden_states, residual = layer(
263
+ positions,
264
+ hidden_states,
265
+ kv_caches[i],
266
+ input_metadata,
267
+ residual,
268
+ )
269
+ hidden_states, _ = self.norm(hidden_states, residual)
270
+ return hidden_states
271
+
272
+ def load_weights(
273
+ self,
274
+ model_name_or_path: str,
275
+ cache_dir: Optional[str] = None,
276
+ load_format: str = "auto",
277
+ revision: Optional[str] = None,
278
+ ):
279
+ stacked_params_mapping = [
280
+ # (param_name, shard_name, shard_id)
281
+ ("qkv_proj", "q_proj", "q"),
282
+ ("qkv_proj", "k_proj", "k"),
283
+ ("qkv_proj", "v_proj", "v"),
284
+ ("gate_up_proj", "gate_proj", 0),
285
+ ("gate_up_proj", "up_proj", 1),
286
+ ]
287
+ params_dict = dict(self.named_parameters())
288
+ for name, loaded_weight in hf_model_weights_iterator(
289
+ model_name_or_path, cache_dir, load_format, revision
290
+ ):
291
+ if "rotary_emb.inv_freq" in name:
292
+ continue
293
+ if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
294
+ # Models trained using ColossalAI may include these tensors in
295
+ # the checkpoint. Skip them.
296
+ continue
297
+ for param_name, weight_name, shard_id in stacked_params_mapping:
298
+ if weight_name not in name:
299
+ continue
300
+ name = name.replace(weight_name, param_name)
301
+ # Skip loading extra bias for GPTQ models.
302
+ if name.endswith(".bias") and name not in params_dict:
303
+ continue
304
+ param = params_dict[name]
305
+ weight_loader = param.weight_loader
306
+ weight_loader(param, loaded_weight, shard_id)
307
+ break
308
+ else:
309
+ # Skip loading extra bias for GPTQ models.
310
+ if name.endswith(".bias") and name not in params_dict:
311
+ continue
312
+ param = params_dict[name]
313
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
314
+ weight_loader(param, loaded_weight)
315
+
316
+
317
+ class LlamaForCausalLM(nn.Module):
318
+
319
+ def __init__(
320
+ self,
321
+ config: LlamaConfig,
322
+ linear_method: Optional[LinearMethodBase] = None,
323
+ ) -> None:
324
+ super().__init__()
325
+ self.config = config
326
+ self.linear_method = linear_method
327
+ self.model = LlamaModel(config, linear_method)
328
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
329
+ self.sampler = Sampler(config.vocab_size)
330
+
331
+ def forward(
332
+ self,
333
+ input_ids: torch.Tensor,
334
+ positions: torch.Tensor,
335
+ kv_caches: List[KVCache],
336
+ input_metadata: InputMetadata,
337
+ ) -> torch.Tensor:
338
+ hidden_states = self.model(input_ids, positions, kv_caches, input_metadata)
339
+ return hidden_states
340
+
341
+ def sample(
342
+ self,
343
+ hidden_states: torch.Tensor,
344
+ sampling_metadata: SamplingMetadata,
345
+ ) -> Optional[SamplerOutput]:
346
+ next_tokens = self.sampler(
347
+ self.lm_head.weight, hidden_states, sampling_metadata
348
+ )
349
+ return next_tokens
350
+
351
+ def load_weights(
352
+ self,
353
+ model_name_or_path: str,
354
+ cache_dir: Optional[str] = None,
355
+ load_format: str = "auto",
356
+ revision: Optional[str] = None,
357
+ ):
358
+ stacked_params_mapping = [
359
+ # (param_name, shard_name, shard_id)
360
+ ("qkv_proj", "q_proj", "q"),
361
+ ("qkv_proj", "k_proj", "k"),
362
+ ("qkv_proj", "v_proj", "v"),
363
+ ("gate_up_proj", "gate_proj", 0),
364
+ ("gate_up_proj", "up_proj", 1),
365
+ ]
366
+ params_dict = dict(self.named_parameters())
367
+ for name, loaded_weight in hf_model_weights_iterator(
368
+ model_name_or_path, cache_dir, load_format, revision
369
+ ):
370
+ if "rotary_emb.inv_freq" in name:
371
+ continue
372
+ if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
373
+ # Models trained using ColossalAI may include these tensors in
374
+ # the checkpoint. Skip them.
375
+ continue
376
+ for param_name, weight_name, shard_id in stacked_params_mapping:
377
+ if weight_name not in name:
378
+ continue
379
+ name = name.replace(weight_name, param_name)
380
+ # Skip loading extra bias for GPTQ models.
381
+ if name.endswith(".bias") and name not in params_dict:
382
+ continue
383
+ param = params_dict[name]
384
+ weight_loader = param.weight_loader
385
+ weight_loader(param, loaded_weight, shard_id)
386
+ break
387
+ else:
388
+ # Skip loading extra bias for GPTQ models.
389
+ if name.endswith(".bias") and name not in params_dict:
390
+ continue
391
+ param = params_dict[name]
392
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
393
+ weight_loader(param, loaded_weight)
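The two load_weights methods above use the same rename-and-shard scheme: HuggingFace checkpoints keep q_proj/k_proj/v_proj and gate_proj/up_proj as separate tensors, while this model fuses them into qkv_proj and gate_up_proj, so each checkpoint name is rewritten and tagged with the shard it fills before the parameter's weight_loader is called. A small standalone illustration of just the renaming step (no vllm dependency; the example weight names are hypothetical):

# Standalone illustration of the checkpoint-name rewriting in load_weights.
stacked_params_mapping = [
    # (fused param name, checkpoint shard name, shard id)
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def remap(name):
    """Return (fused_name, shard_id); shard_id is None for unfused weights."""
    for fused, shard_name, shard_id in stacked_params_mapping:
        if shard_name in name:
            return name.replace(shard_name, fused), shard_id
    return name, None

print(remap("model.layers.0.self_attn.k_proj.weight"))
# -> ('model.layers.0.self_attn.qkv_proj.weight', 'k')
print(remap("model.layers.0.mlp.up_proj.weight"))
# -> ('model.layers.0.mlp.gate_up_proj.weight', 1)
print(remap("model.layers.0.input_layernorm.weight"))
# -> ('model.layers.0.input_layernorm.weight', None)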
ChatTTS/model/velocity/llm.py ADDED
@@ -0,0 +1,213 @@
1
+ from typing import List, Optional, Union
2
+
3
+ from tqdm import tqdm
4
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
5
+ from vllm.utils import Counter
6
+
7
+ from .configs import EngineArgs
8
+ from .llm_engine import LLMEngine
9
+ from .output import RequestOutput
10
+ from .sampling_params import SamplingParams
11
+
12
+
13
+ class LLM:
14
+ """An LLM for generating texts from given prompts and sampling parameters.
15
+
16
+ This class includes a tokenizer, a language model (possibly distributed
17
+ across multiple GPUs), and GPU memory space allocated for intermediate
18
+ states (aka KV cache). Given a batch of prompts and sampling parameters,
19
+ this class generates texts from the model, using an intelligent batching
20
+ mechanism and efficient memory management.
21
+
22
+ NOTE: This class is intended to be used for offline inference. For online
23
+ serving, use the `AsyncLLMEngine` class instead.
24
+ NOTE: For the comprehensive list of arguments, see `EngineArgs`.
25
+
26
+ Args:
27
+ model: The name or path of a HuggingFace Transformers model.
28
+ tokenizer: The name or path of a HuggingFace Transformers tokenizer.
29
+ tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
30
+ if available, and "slow" will always use the slow tokenizer.
31
+ trust_remote_code: Trust remote code (e.g., from HuggingFace) when
32
+ downloading the model and tokenizer.
33
+ tensor_parallel_size: The number of GPUs to use for distributed
34
+ execution with tensor parallelism.
35
+ dtype: The data type for the model weights and activations. Currently,
36
+ we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
37
+ the `torch_dtype` attribute specified in the model config file.
38
+ However, if the `torch_dtype` in the config is `float32`, we will
39
+ use `float16` instead.
40
+ quantization: The method used to quantize the model weights. Currently,
41
+ we support "awq", "gptq" and "squeezellm". If None, we first check
42
+ the `quantization_config` attribute in the model config file. If
43
+ that is None, we assume the model weights are not quantized and use
44
+ `dtype` to determine the data type of the weights.
45
+ revision: The specific model version to use. It can be a branch name,
46
+ a tag name, or a commit id.
47
+ tokenizer_revision: The specific tokenizer version to use. It can be a
48
+ branch name, a tag name, or a commit id.
49
+ seed: The seed to initialize the random number generator for sampling.
50
+ gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
51
+ reserve for the model weights, activations, and KV cache. Higher
52
+ values will increase the KV cache size and thus improve the model's
53
+ throughput. However, if the value is too high, it may cause out-of-
54
+ memory (OOM) errors.
55
+ swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
56
+ This can be used for temporarily storing the states of the requests
57
+ when their `best_of` sampling parameters are larger than 1. If all
58
+ requests will have `best_of=1`, you can safely set this to 0.
59
+ Otherwise, too small values may cause out-of-memory (OOM) errors.
60
+ enforce_eager: Whether to enforce eager execution. If True, we will
61
+ disable CUDA graph and always execute the model in eager mode.
62
+ If False, we will use CUDA graph and eager execution in hybrid.
63
+ max_context_len_to_capture: Maximum context len covered by CUDA graphs.
64
+ When a sequence has context length larger than this, we fall back
65
+ to eager mode.
66
+ """
67
+
68
+ def __init__(
69
+ self,
70
+ model: str,
71
+ tokenizer: Optional[str] = None,
72
+ tokenizer_mode: str = "auto",
73
+ trust_remote_code: bool = False,
74
+ tensor_parallel_size: int = 1,
75
+ dtype: str = "auto",
76
+ quantization: Optional[str] = None,
77
+ revision: Optional[str] = None,
78
+ tokenizer_revision: Optional[str] = None,
79
+ seed: int = 0,
80
+ gpu_memory_utilization: float = 0.9,
81
+ swap_space: int = 4,
82
+ enforce_eager: bool = False,
83
+ max_context_len_to_capture: int = 8192,
84
+ post_model_path: str = None,
85
+ num_audio_tokens: int = 0,
86
+ num_text_tokens: int = 0,
87
+ **kwargs,
88
+ ) -> None:
89
+ if "disable_log_stats" not in kwargs:
90
+ kwargs["disable_log_stats"] = True
91
+ engine_args = EngineArgs(
92
+ model=model,
93
+ tokenizer=tokenizer,
94
+ tokenizer_mode=tokenizer_mode,
95
+ trust_remote_code=trust_remote_code,
96
+ tensor_parallel_size=tensor_parallel_size,
97
+ dtype=dtype,
98
+ quantization=quantization,
99
+ revision=revision,
100
+ tokenizer_revision=tokenizer_revision,
101
+ seed=seed,
102
+ gpu_memory_utilization=gpu_memory_utilization,
103
+ swap_space=swap_space,
104
+ enforce_eager=enforce_eager,
105
+ max_context_len_to_capture=max_context_len_to_capture,
106
+ num_audio_tokens=num_audio_tokens,
107
+ num_text_tokens=num_text_tokens,
108
+ **kwargs,
109
+ )
110
+ self.llm_engine = LLMEngine.from_engine_args(engine_args, post_model_path)
111
+ self.request_counter = Counter()
112
+
113
+ def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
114
+ return self.llm_engine.tokenizer
115
+
116
+ def set_tokenizer(
117
+ self,
118
+ tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
119
+ ) -> None:
120
+ self.llm_engine.tokenizer = tokenizer
121
+
122
+ def generate(
123
+ self,
124
+ prompts: Optional[Union[str, List[str]]] = None,
125
+ sampling_params: Optional[SamplingParams] = None,
126
+ prompt_token_ids: Optional[List[List[int]]] = None,
127
+ use_tqdm: bool = True,
128
+ ) -> List[RequestOutput]:
129
+ """Generates the completions for the input prompts.
130
+
131
+ NOTE: This class automatically batches the given prompts, considering
132
+ the memory constraint. For the best performance, put all of your prompts
133
+ into a single list and pass it to this method.
134
+
135
+ Args:
136
+ prompts: A list of prompts to generate completions for.
137
+ sampling_params: The sampling parameters for text generation. If
138
+ None, we use the default sampling parameters.
139
+ prompt_token_ids: A list of token IDs for the prompts. If None, we
140
+ use the tokenizer to convert the prompts to token IDs.
141
+ use_tqdm: Whether to use tqdm to display the progress bar.
142
+
143
+ Returns:
144
+ A list of `RequestOutput` objects containing the generated
145
+ completions in the same order as the input prompts.
146
+ """
147
+ if prompts is None and prompt_token_ids is None:
148
+ raise ValueError("Either prompts or prompt_token_ids must be " "provided.")
149
+ if isinstance(prompts, str):
150
+ # Convert a single prompt to a list.
151
+ prompts = [prompts]
152
+ if (
153
+ prompts is not None
154
+ and prompt_token_ids is not None
155
+ and len(prompts) != len(prompt_token_ids)
156
+ ):
157
+ raise ValueError(
158
+ "The lengths of prompts and prompt_token_ids " "must be the same."
159
+ )
160
+ if sampling_params is None:
161
+ # Use default sampling params.
162
+ sampling_params = SamplingParams()
163
+
164
+ # Add requests to the engine.
165
+ num_requests = len(prompts) if prompts is not None else len(prompt_token_ids)
166
+ for i in range(num_requests):
167
+ prompt = prompts[i] if prompts is not None else None
168
+ token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
169
+ self._add_request(prompt, sampling_params, token_ids)
170
+
171
+ rtns = self._run_engine(use_tqdm)
172
+ for i, rtn in enumerate(rtns):
173
+ token_ids = rtn.outputs[0].token_ids
174
+ for j, token_id in enumerate(token_ids):
175
+ if len(token_id) == 1:
176
+ token_ids[j] = token_id[0]
177
+ else:
178
+ token_ids[j] = list(token_id)
179
+
180
+ return rtns
181
+
182
+ def _add_request(
183
+ self,
184
+ prompt: Optional[str],
185
+ sampling_params: SamplingParams,
186
+ prompt_token_ids: Optional[List[int]],
187
+ ) -> None:
188
+ request_id = str(next(self.request_counter))
189
+ self.llm_engine.add_request(
190
+ request_id, prompt, sampling_params, prompt_token_ids
191
+ )
192
+
193
+ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
194
+ # Initialize tqdm.
195
+ if use_tqdm:
196
+ num_requests = self.llm_engine.get_num_unfinished_requests()
197
+ pbar = tqdm(total=num_requests, desc="Processed prompts")
198
+ # Run the engine.
199
+ outputs: List[RequestOutput] = []
200
+ while self.llm_engine.has_unfinished_requests():
201
+ step_outputs = self.llm_engine.step()
202
+ for output in step_outputs:
203
+ if output.finished:
204
+ outputs.append(output)
205
+ if use_tqdm:
206
+ pbar.update(1)
207
+ if use_tqdm:
208
+ pbar.close()
209
+ # Sort the outputs by request ID.
210
+ # This is necessary because some requests may be finished earlier than
211
+ # requests that were submitted before them.
212
+ outputs = sorted(outputs, key=lambda x: int(x.request_id))
213
+ return outputs
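A minimal offline-generation sketch for this class. The import paths follow the package layout added in this commit; the checkpoint directory and post_model_path are placeholders, the token counts reuse the EngineArgs defaults, and the prompt ids are dummy values (in ChatTTS the prompts arrive as pre-tokenized text/audio ids), so treat this as an illustration rather than a working recipe:

from ChatTTS.model.velocity.llm import LLM
from ChatTTS.model.velocity.sampling_params import SamplingParams

# Placeholder paths; the real values come from the ChatTTS asset downloads.
llm = LLM(
    model="/path/to/gpt",
    post_model_path="/path/to/post_model",
    num_audio_tokens=1024,  # placeholder, matching the EngineArgs default
    num_text_tokens=80,     # placeholder, matching the EngineArgs default
    gpu_memory_utilization=0.5,
)

params = SamplingParams(max_tokens=512)  # max_tokens is read by the engine
outputs = llm.generate(
    prompt_token_ids=[[1, 2, 3, 4]],  # the engine's add_request requires token ids
    sampling_params=params,
)
for out in outputs:
    print(out.request_id, len(out.outputs[0].token_ids))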
ChatTTS/model/velocity/llm_engine.py ADDED
@@ -0,0 +1,833 @@
1
+ import copy
2
+ from collections import defaultdict
3
+ import os
4
+ import time
5
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
6
+
7
+ from vllm.config import CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig
8
+ from .scheduler import Scheduler, SchedulerOutputs
9
+ from .configs import EngineArgs
10
+ from vllm.engine.metrics import record_metrics
11
+ from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
12
+ from vllm.logger import init_logger
13
+ from .output import RequestOutput
14
+ from .sampling_params import SamplingParams
15
+ from .sequence import (
16
+ SamplerOutput,
17
+ Sequence,
18
+ SequenceGroup,
19
+ SequenceGroupOutput,
20
+ SequenceOutput,
21
+ SequenceStatus,
22
+ )
23
+ from vllm.transformers_utils.tokenizer import detokenize_incrementally, get_tokenizer
24
+ from vllm.utils import Counter, set_cuda_visible_devices, get_ip, get_open_port
25
+ import numpy as np
26
+
27
+ if ray:
28
+ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
29
+
30
+ if TYPE_CHECKING:
31
+ from ray.util.placement_group import PlacementGroup
32
+
33
+ logger = init_logger(__name__)
34
+
35
+ _LOGGING_INTERVAL_SEC = 5
36
+
37
+
38
+ class LLMEngine:
39
+ """An LLM engine that receives requests and generates texts.
40
+
41
+ This is the main class for the vLLM engine. It receives requests
42
+ from clients and generates texts from the LLM. It includes a tokenizer, a
43
+ language model (possibly distributed across multiple GPUs), and GPU memory
44
+ space allocated for intermediate states (aka KV cache). This class utilizes
45
+ iteration-level scheduling and efficient memory management to maximize the
46
+ serving throughput.
47
+
48
+ The `LLM` class wraps this class for offline batched inference and the
49
+ `AsyncLLMEngine` class wraps this class for online serving.
50
+
51
+ NOTE: The config arguments are derived from the `EngineArgs` class. For the
52
+ comprehensive list of arguments, see `EngineArgs`.
53
+
54
+ Args:
55
+ model_config: The configuration related to the LLM model.
56
+ cache_config: The configuration related to the KV cache memory
57
+ management.
58
+ parallel_config: The configuration related to distributed execution.
59
+ scheduler_config: The configuration related to the request scheduler.
60
+ placement_group: Ray placement group for distributed execution.
61
+ Required for distributed execution.
62
+ log_stats: Whether to log statistics.
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ model_config: ModelConfig,
68
+ cache_config: CacheConfig,
69
+ parallel_config: ParallelConfig,
70
+ scheduler_config: SchedulerConfig,
71
+ placement_group: Optional["PlacementGroup"],
72
+ post_model_path: str,
73
+ log_stats: bool,
74
+ ) -> None:
75
+ logger.info(
76
+ "Initializing an LLM engine with config: "
77
+ f"model={model_config.model!r}, "
78
+ f"tokenizer={model_config.tokenizer!r}, "
79
+ f"tokenizer_mode={model_config.tokenizer_mode}, "
80
+ f"revision={model_config.revision}, "
81
+ f"tokenizer_revision={model_config.tokenizer_revision}, "
82
+ f"trust_remote_code={model_config.trust_remote_code}, "
83
+ f"dtype={model_config.dtype}, "
84
+ f"max_seq_len={model_config.max_model_len}, "
85
+ f"download_dir={model_config.download_dir!r}, "
86
+ f"load_format={model_config.load_format}, "
87
+ f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
88
+ f"quantization={model_config.quantization}, "
89
+ f"enforce_eager={model_config.enforce_eager}, "
90
+ f"seed={model_config.seed}), "
91
+ f"post_model_path={post_model_path!r}"
92
+ )
93
+ # TODO(woosuk): Print more configs in debug mode.
94
+
95
+ self.model_config = model_config
96
+ self.cache_config = cache_config
97
+ self.parallel_config = parallel_config
98
+ self.scheduler_config = scheduler_config
99
+ self.log_stats = log_stats
100
+ self._verify_args()
101
+ self.post_model_path = post_model_path
102
+ self.seq_counter = Counter()
103
+
104
+ # Create the parallel GPU workers.
105
+ if self.parallel_config.worker_use_ray:
106
+ # Disable Ray usage stats collection.
107
+ ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
108
+ if ray_usage != "1":
109
+ os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
110
+ self._init_workers_ray(placement_group)
111
+ else:
112
+ self._init_workers()
113
+
114
+ # Profile the memory usage and initialize the cache.
115
+ self._init_cache()
116
+
117
+ # Create the scheduler.
118
+ self.scheduler = Scheduler(scheduler_config, cache_config)
119
+
120
+ # Logging.
121
+ self.last_logging_time = 0.0
122
+ # List of (timestamp, num_tokens)
123
+ self.num_prompt_tokens: List[Tuple[float, int]] = []
124
+ # List of (timestamp, num_tokens)
125
+ self.num_generation_tokens: List[Tuple[float, int]] = []
126
+
127
+ def _init_workers(self):
128
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
129
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
130
+ from .worker import Worker
131
+
132
+ assert (
133
+ self.parallel_config.world_size == 1
134
+ ), "Ray is required if parallel_config.world_size > 1."
135
+
136
+ self.workers: List[Worker] = []
137
+ distributed_init_method = f"tcp://{get_ip()}:{get_open_port()}"
138
+ self.driver_worker = Worker(
139
+ self.model_config,
140
+ self.parallel_config,
141
+ self.scheduler_config,
142
+ local_rank=0,
143
+ rank=0,
144
+ distributed_init_method=distributed_init_method,
145
+ is_driver_worker=True,
146
+ post_model_path=self.post_model_path,
147
+ )
148
+ self._run_workers("init_model")
149
+ self._run_workers("load_model")
150
+
151
+ def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs):
152
+ if self.parallel_config.tensor_parallel_size == 1:
153
+ num_gpus = self.cache_config.gpu_memory_utilization
154
+ else:
155
+ num_gpus = 1
156
+
157
+ self.driver_dummy_worker: RayWorkerVllm = None
158
+ self.workers: List[RayWorkerVllm] = []
159
+
160
+ driver_ip = get_ip()
161
+ for bundle_id, bundle in enumerate(placement_group.bundle_specs):
162
+ if not bundle.get("GPU", 0):
163
+ continue
164
+ scheduling_strategy = PlacementGroupSchedulingStrategy(
165
+ placement_group=placement_group,
166
+ placement_group_capture_child_tasks=True,
167
+ placement_group_bundle_index=bundle_id,
168
+ )
169
+ worker = ray.remote(
170
+ num_cpus=0,
171
+ num_gpus=num_gpus,
172
+ scheduling_strategy=scheduling_strategy,
173
+ **ray_remote_kwargs,
174
+ )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
175
+
176
+ worker_ip = ray.get(worker.get_node_ip.remote())
177
+ if worker_ip == driver_ip and self.driver_dummy_worker is None:
178
+ # If the worker is on the same node as the driver, we use it
179
+ # as the resource holder for the driver process.
180
+ self.driver_dummy_worker = worker
181
+ else:
182
+ self.workers.append(worker)
183
+
184
+ if self.driver_dummy_worker is None:
185
+ raise ValueError(
186
+ "Ray does not allocate any GPUs on the driver node. Consider "
187
+ "adjusting the Ray placement group or running the driver on a "
188
+ "GPU node."
189
+ )
190
+
191
+ driver_node_id, driver_gpu_ids = ray.get(
192
+ self.driver_dummy_worker.get_node_and_gpu_ids.remote()
193
+ )
194
+ worker_node_and_gpu_ids = ray.get(
195
+ [worker.get_node_and_gpu_ids.remote() for worker in self.workers]
196
+ )
197
+
198
+ node_workers = defaultdict(list)
199
+ node_gpus = defaultdict(list)
200
+
201
+ node_workers[driver_node_id].append(0)
202
+ node_gpus[driver_node_id].extend(driver_gpu_ids)
203
+ for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, start=1):
204
+ node_workers[node_id].append(i)
205
+ node_gpus[node_id].extend(gpu_ids)
206
+ for node_id, gpu_ids in node_gpus.items():
207
+ node_gpus[node_id] = sorted(gpu_ids)
208
+
209
+ # Set CUDA_VISIBLE_DEVICES for the driver.
210
+ set_cuda_visible_devices(node_gpus[driver_node_id])
211
+ for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
212
+ worker.set_cuda_visible_devices.remote(node_gpus[node_id])
213
+
214
+ distributed_init_method = f"tcp://{driver_ip}:{get_open_port()}"
215
+
216
+ # Lazy import the Worker to avoid importing torch.cuda/xformers
217
+ # before CUDA_VISIBLE_DEVICES is set in the Worker
218
+ from vllm.worker.worker import Worker
219
+
220
+ # Initialize torch distributed process group for the workers.
221
+ model_config = copy.deepcopy(self.model_config)
222
+ parallel_config = copy.deepcopy(self.parallel_config)
223
+ scheduler_config = copy.deepcopy(self.scheduler_config)
224
+
225
+ for rank, (worker, (node_id, _)) in enumerate(
226
+ zip(self.workers, worker_node_and_gpu_ids), start=1
227
+ ):
228
+ local_rank = node_workers[node_id].index(rank)
229
+ worker.init_worker.remote(
230
+ lambda rank=rank, local_rank=local_rank: Worker(
231
+ model_config,
232
+ parallel_config,
233
+ scheduler_config,
234
+ local_rank,
235
+ rank,
236
+ distributed_init_method,
237
+ )
238
+ )
239
+
240
+ driver_rank = 0
241
+ driver_local_rank = node_workers[driver_node_id].index(driver_rank)
242
+ self.driver_worker = Worker(
243
+ model_config,
244
+ parallel_config,
245
+ scheduler_config,
246
+ driver_local_rank,
247
+ driver_rank,
248
+ distributed_init_method,
249
+ is_driver_worker=True,
250
+ )
251
+
252
+ self._run_workers("init_model")
253
+ self._run_workers(
254
+ "load_model",
255
+ max_concurrent_workers=self.parallel_config.max_parallel_loading_workers,
256
+ )
257
+
258
+ def _verify_args(self) -> None:
259
+ self.model_config.verify_with_parallel_config(self.parallel_config)
260
+ self.cache_config.verify_with_parallel_config(self.parallel_config)
261
+
262
+ def _init_cache(self) -> None:
263
+ """Profiles the memory usage and initializes the KV cache."""
264
+ # Get the maximum number of blocks that can be allocated on GPU and CPU.
265
+ num_blocks = self._run_workers(
266
+ "profile_num_available_blocks",
267
+ block_size=self.cache_config.block_size,
268
+ gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
269
+ cpu_swap_space=self.cache_config.swap_space_bytes,
270
+ )
271
+
272
+ # Since we use a shared centralized controller, we take the minimum
273
+ # number of blocks across all workers to make sure all the memory
274
+ # operators can be applied to all workers.
275
+ num_gpu_blocks = min(b[0] for b in num_blocks)
276
+ num_cpu_blocks = min(b[1] for b in num_blocks)
277
+ # FIXME(woosuk): Change to debug log.
278
+ logger.info(
279
+ f"# GPU blocks: {num_gpu_blocks}, " f"# CPU blocks: {num_cpu_blocks}"
280
+ )
281
+
282
+ if num_gpu_blocks <= 0:
283
+ raise ValueError(
284
+ "No available memory for the cache blocks. "
285
+ "Try increasing `gpu_memory_utilization` when "
286
+ "initializing the engine."
287
+ )
288
+ max_seq_len = self.cache_config.block_size * num_gpu_blocks
289
+ if self.model_config.max_model_len > max_seq_len:
290
+ raise ValueError(
291
+ f"The model's max seq len ({self.model_config.max_model_len}) "
292
+ "is larger than the maximum number of tokens that can be "
293
+ f"stored in KV cache ({max_seq_len}). Try increasing "
294
+ "`gpu_memory_utilization` or decreasing `max_model_len` when "
295
+ "initializing the engine."
296
+ )
297
+
298
+ self.cache_config.num_gpu_blocks = num_gpu_blocks
299
+ self.cache_config.num_cpu_blocks = num_cpu_blocks
300
+
301
+ # Initialize the cache.
302
+ self._run_workers("init_cache_engine", cache_config=self.cache_config)
303
+ # Warm up the model. This includes capturing the model into CUDA graph
304
+ # if enforce_eager is False.
305
+ self._run_workers("warm_up_model")
306
+
307
+ @classmethod
308
+ def from_engine_args(
309
+ cls, engine_args: EngineArgs, post_model_path=None
310
+ ) -> "LLMEngine":
311
+ """Creates an LLM engine from the engine arguments."""
312
+ # Create the engine configs.
313
+ engine_configs = engine_args.create_engine_configs()
314
+ parallel_config = engine_configs[2]
315
+ # Initialize the cluster.
316
+ placement_group = initialize_cluster(parallel_config)
317
+ # Create the LLM engine.
318
+ engine = cls(
319
+ *engine_configs,
320
+ placement_group,
321
+ log_stats=not engine_args.disable_log_stats,
322
+ post_model_path=post_model_path,
323
+ )
324
+ return engine
325
+
326
+ def add_request(
327
+ self,
328
+ request_id: str,
329
+ prompt: Optional[str],
330
+ sampling_params: SamplingParams,
331
+ prompt_token_ids: Optional[List[int]] = None,
332
+ arrival_time: Optional[float] = None,
333
+ ) -> None:
334
+ """Add a request to the engine's request pool.
335
+
336
+ The request is added to the request pool and will be processed by the
337
+ scheduler as `engine.step()` is called. The exact scheduling policy is
338
+ determined by the scheduler.
339
+
340
+ Args:
341
+ request_id: The unique ID of the request.
342
+ prompt: The prompt string. Can be None if prompt_token_ids is
343
+ provided.
344
+ sampling_params: The sampling parameters for text generation.
345
+ prompt_token_ids: The token IDs of the prompt. If None, we
346
+ use the tokenizer to convert the prompts to token IDs.
347
+ arrival_time: The arrival time of the request. If None, we use
348
+ the current monotonic time.
349
+ """
350
+ if arrival_time is None:
351
+ arrival_time = time.monotonic()
352
+
353
+ assert prompt_token_ids is not None, "prompt_token_ids must be provided"
354
+ # Create the sequences.
355
+ block_size = self.cache_config.block_size
356
+ seq_id = next(self.seq_counter)
357
+ seq = Sequence(seq_id, prompt, prompt_token_ids, block_size)
358
+
359
+ # Create the sequence group.
360
+ seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time)
361
+
362
+ # Add the sequence group to the scheduler.
363
+ self.scheduler.add_seq_group(seq_group)
364
+
365
+ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
366
+ """Aborts a request(s) with the given ID.
367
+
368
+ Args:
369
+ request_id: The ID(s) of the request to abort.
370
+ """
371
+ self.scheduler.abort_seq_group(request_id)
372
+
373
+ def get_model_config(self) -> ModelConfig:
374
+ """Gets the model configuration."""
375
+ return self.model_config
376
+
377
+ def get_num_unfinished_requests(self) -> int:
378
+ """Gets the number of unfinished requests."""
379
+ return self.scheduler.get_num_unfinished_seq_groups()
380
+
381
+ def has_unfinished_requests(self) -> bool:
382
+ """Returns True if there are unfinished requests."""
383
+ return self.scheduler.has_unfinished_seqs()
384
+
385
+ def _check_beam_search_early_stopping(
386
+ self,
387
+ early_stopping: Union[bool, str],
388
+ sampling_params: SamplingParams,
389
+ best_running_seq: Sequence,
390
+ current_worst_seq: Sequence,
391
+ ) -> bool:
392
+ assert sampling_params.use_beam_search
393
+ length_penalty = sampling_params.length_penalty
394
+ if early_stopping is True:
395
+ return True
396
+
397
+ current_worst_score = current_worst_seq.get_beam_search_score(
398
+ length_penalty=length_penalty, eos_token_id=self.tokenizer.eos_token_id
399
+ )
400
+ if early_stopping is False:
401
+ highest_attainable_score = best_running_seq.get_beam_search_score(
402
+ length_penalty=length_penalty, eos_token_id=self.tokenizer.eos_token_id
403
+ )
404
+ else:
405
+ assert early_stopping == "never"
406
+ if length_penalty > 0.0:
407
+ # If length_penalty > 0.0, beam search will prefer longer
408
+ # sequences. The highest attainable score calculation is
409
+ # based on the longest possible sequence length in this case.
410
+ max_possible_length = max(
411
+ best_running_seq.get_prompt_len() + sampling_params.max_tokens,
412
+ self.scheduler_config.max_model_len,
413
+ )
414
+ highest_attainable_score = best_running_seq.get_beam_search_score(
415
+ length_penalty=length_penalty,
416
+ eos_token_id=self.tokenizer.eos_token_id,
417
+ seq_len=max_possible_length,
418
+ )
419
+ else:
420
+ # Otherwise, beam search will prefer shorter sequences. The
421
+ # highest attainable score calculation is based on the current
422
+ # sequence length.
423
+ highest_attainable_score = best_running_seq.get_beam_search_score(
424
+ length_penalty=length_penalty,
425
+ eos_token_id=self.tokenizer.eos_token_id,
426
+ )
427
+ return current_worst_score >= highest_attainable_score
428
+
429
+ def _process_sequence_group_outputs(
430
+ self, seq_group: SequenceGroup, outputs: SequenceGroupOutput
431
+ ) -> None:
432
+ # Process prompt logprobs
433
+ prompt_logprobs = outputs.prompt_logprobs
434
+ if prompt_logprobs is not None:
435
+ seq_group.prompt_logprobs = prompt_logprobs
436
+
437
+ # Process samples
438
+ samples = outputs.samples
439
+ parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
440
+ existing_finished_seqs = seq_group.get_finished_seqs()
441
+ parent_child_dict = {parent_seq.seq_id: [] for parent_seq in parent_seqs}
442
+ for sample in samples:
443
+ parent_child_dict[sample.parent_seq_id].append(sample)
444
+ # List of (child, parent)
445
+ child_seqs: List[Tuple[Sequence, Sequence]] = []
446
+
447
+ # Process the child samples for each parent sequence
448
+ for parent in parent_seqs:
449
+ child_samples: List[SequenceOutput] = parent_child_dict[parent.seq_id]
450
+ if len(child_samples) == 0:
451
+ # This parent sequence has no child samples. Remove
452
+ # the parent sequence from the sequence group since it will
453
+ # not be used in the future iterations.
454
+ parent.status = SequenceStatus.FINISHED_ABORTED
455
+ seq_group.remove(parent.seq_id)
456
+ self.scheduler.free_seq(parent)
457
+ continue
458
+ # Fork the parent sequence if there are multiple child samples.
459
+ for child_sample in child_samples[:-1]:
460
+ new_child_seq_id = next(self.seq_counter)
461
+ child = parent.fork(new_child_seq_id)
462
+ child.append_token_id(
463
+ child_sample.output_token,
464
+ child_sample.logprobs,
465
+ child_sample.hidden_states,
466
+ child_sample.finished,
467
+ )
468
+ child_seqs.append((child, parent))
469
+ # Continue the parent sequence for the last child sample.
470
+ # We reuse the parent sequence here to reduce redundant memory
471
+ # copies, especially when using non-beam search sampling methods.
472
+ last_child_sample = child_samples[-1]
473
+ parent.append_token_id(
474
+ last_child_sample.output_token,
475
+ last_child_sample.logprobs,
476
+ last_child_sample.hidden_states,
477
+ last_child_sample.finished,
478
+ )
479
+ child_seqs.append((parent, parent))
480
+
481
+ for seq, _ in child_seqs:
482
+ # self._decode_sequence(seq, seq_group.sampling_params)
483
+ self._check_stop(seq, seq_group.sampling_params)
484
+
485
+ # Non-beam search case
486
+ if not seq_group.sampling_params.use_beam_search:
487
+ # For newly created child sequences, add them to the sequence group
488
+ # and fork them in block manager if they are not finished.
489
+ for seq, parent in child_seqs:
490
+ if seq is not parent:
491
+ seq_group.add(seq)
492
+ if not seq.is_finished():
493
+ self.scheduler.fork_seq(parent, seq)
494
+
495
+ # Free the finished and selected parent sequences' memory in block
496
+ # manager. Keep them in the sequence group as candidate output.
497
+ # NOTE: we need to fork the new sequences before freeing the
498
+ # old sequences.
499
+ for seq, parent in child_seqs:
500
+ if seq is parent and seq.is_finished():
501
+ self.scheduler.free_seq(seq)
502
+ return
503
+
504
+ # Beam search case
505
+ # Select the child sequences to keep in the sequence group.
506
+ selected_child_seqs = []
507
+ unselected_child_seqs = []
508
+ beam_width = seq_group.sampling_params.best_of
509
+ length_penalty = seq_group.sampling_params.length_penalty
510
+
511
+ # Select the newly finished sequences with the highest scores
512
+ # to replace existing finished sequences.
513
+ # Tuple of (seq, parent, is_new)
514
+ existing_finished_seqs = [(seq, None, False) for seq in existing_finished_seqs]
515
+ new_finished_seqs = [
516
+ (seq, parent, True) for seq, parent in child_seqs if seq.is_finished()
517
+ ]
518
+ all_finished_seqs = existing_finished_seqs + new_finished_seqs
519
+ # Sort the finished sequences by their scores.
520
+ all_finished_seqs.sort(
521
+ key=lambda x: x[0].get_beam_search_score(
522
+ length_penalty=length_penalty, eos_token_id=self.tokenizer.eos_token_id
523
+ ),
524
+ reverse=True,
525
+ )
526
+ for seq, parent, is_new in all_finished_seqs[:beam_width]:
527
+ if is_new:
528
+ # A newly generated child sequence finishes and has a high
529
+ # score, so we will add it into the sequence group.
530
+ selected_child_seqs.append((seq, parent))
531
+ for seq, parent, is_new in all_finished_seqs[beam_width:]:
532
+ if is_new:
533
+ # A newly generated child sequence finishes but has a low
534
+ # score, so we will not add it into the sequence group.
535
+ # Additionally, if this sequence is a continuation of a
536
+ # parent sequence, we will need to remove the parent sequence
537
+ # from the sequence group.
538
+ unselected_child_seqs.append((seq, parent))
539
+ else:
540
+ # An existing finished sequence has a low score, so we will
541
+ # remove it from the sequence group.
542
+ seq_group.remove(seq.seq_id)
543
+
544
+ # Select the top beam_width sequences from the running
545
+ # sequences for the next iteration to continue the beam
546
+ # search.
547
+ running_child_seqs = [
548
+ (seq, parent) for seq, parent in child_seqs if not seq.is_finished()
549
+ ]
550
+ # Sort the running sequences by their scores.
551
+ running_child_seqs.sort(
552
+ key=lambda x: x[0].get_beam_search_score(
553
+ length_penalty=length_penalty, eos_token_id=self.tokenizer.eos_token_id
554
+ ),
555
+ reverse=True,
556
+ )
557
+
558
+ # Check if we can stop the beam search.
559
+ if len(running_child_seqs) == 0:
560
+ # No running sequences, stop the beam search.
561
+ stop_beam_search = True
562
+ elif len(all_finished_seqs) < beam_width:
563
+ # Not enough finished sequences, continue the beam search.
564
+ stop_beam_search = False
565
+ else:
566
+ # Check the early stopping criteria
567
+ best_running_seq = running_child_seqs[0][0]
568
+ current_worst_seq = all_finished_seqs[beam_width - 1][0]
569
+ stop_beam_search = self._check_beam_search_early_stopping(
570
+ seq_group.sampling_params.early_stopping,
571
+ seq_group.sampling_params,
572
+ best_running_seq,
573
+ current_worst_seq,
574
+ )
575
+
576
+ if stop_beam_search:
577
+ # Stop the beam search and remove all the running sequences from
578
+ # the sequence group.
579
+ unselected_child_seqs.extend(running_child_seqs)
580
+ else:
581
+ # Continue the beam search and select the top beam_width sequences
582
+ # to continue the beam search.
583
+ selected_child_seqs.extend(running_child_seqs[:beam_width])
584
+ # The remaining running sequences will not be used in the next
585
+ # iteration. Again, if these sequences are continuations of
586
+ # parent sequences, we will need to remove the parent sequences
587
+ # from the sequence group.
588
+ unselected_child_seqs.extend(running_child_seqs[beam_width:])
589
+
590
+ # For newly created child sequences, add them to the sequence group
591
+ # and fork them in block manager if they are not finished.
592
+ for seq, parent in selected_child_seqs:
593
+ if seq is not parent:
594
+ seq_group.add(seq)
595
+ if not seq.is_finished():
596
+ self.scheduler.fork_seq(parent, seq)
597
+
598
+ # Free the finished and selected parent sequences' memory in block
599
+ # manager. Keep them in the sequence group as candidate output.
600
+ for seq, parent in selected_child_seqs:
601
+ if seq is parent and seq.is_finished():
602
+ self.scheduler.free_seq(seq)
603
+
604
+ # Remove the unselected parent sequences from the sequence group and
605
+ # free their memory in block manager.
606
+ for seq, parent in unselected_child_seqs:
607
+ if seq is parent:
608
+ # Remove the parent sequence if it is not selected for next
609
+ # iteration
610
+ seq_group.remove(seq.seq_id)
611
+ self.scheduler.free_seq(seq)
612
+
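The fork/append bookkeeping above feeds the beam-search pruning that follows: finished and running candidates are ranked by `get_beam_search_score`, a length-penalized cumulative log-probability defined on `Sequence`. A minimal sketch of that kind of score, for intuition only (the exact formula lives in sequence.py and may differ in detail):

def beam_search_score(cumulative_logprob: float, seq_len: int,
                      length_penalty: float = 1.0) -> float:
    # Higher is better: total logprob normalized by seq_len ** length_penalty,
    # so length_penalty > 1.0 favors longer candidates and < 1.0 shorter ones.
    return cumulative_logprob / (seq_len ** length_penalty)

# Two finished candidates of equal length compare by raw cumulative logprob:
assert beam_search_score(-4.0, 8) > beam_search_score(-6.0, 8)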
613
+ def _process_model_outputs(
614
+ self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs
615
+ ) -> List[RequestOutput]:
616
+ # Update the scheduled sequence groups with the model outputs.
617
+ scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
618
+ for seq_group, outputs in zip(scheduled_seq_groups, output):
619
+ self._process_sequence_group_outputs(seq_group, outputs)
620
+
621
+ # Free the finished sequence groups.
622
+ self.scheduler.free_finished_seq_groups()
623
+
624
+ # Create the outputs.
625
+ request_outputs: List[RequestOutput] = []
626
+ for seq_group in scheduled_seq_groups + scheduler_outputs.ignored_seq_groups:
627
+ request_output = RequestOutput.from_seq_group(seq_group)
628
+ request_outputs.append(request_output)
629
+
630
+ if self.log_stats:
631
+ # Log the system stats.
632
+ self._log_system_stats(
633
+ scheduler_outputs.prompt_run, scheduler_outputs.num_batched_tokens
634
+ )
635
+ return request_outputs
636
+
637
+ def step(self) -> List[RequestOutput]:
638
+ """Performs one decoding iteration and returns newly generated results.
639
+
640
+ This function performs one decoding iteration of the engine. It first
641
+ schedules the sequences to be executed in the next iteration and the
642
+ token blocks to be swapped in/out/copied. Then, it executes the model
643
+ and updates the scheduler with the model outputs. Finally, it decodes
644
+ the sequences and returns the newly generated results.
645
+ """
646
+ seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
647
+
648
+ if not scheduler_outputs.is_empty():
649
+ # Execute the model.
650
+ all_outputs = self._run_workers(
651
+ "execute_model",
652
+ driver_kwargs={
653
+ "seq_group_metadata_list": seq_group_metadata_list,
654
+ "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in,
655
+ "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out,
656
+ "blocks_to_copy": scheduler_outputs.blocks_to_copy,
657
+ },
658
+ )
659
+
660
+ # Only the driver worker returns the sampling results.
661
+ output = all_outputs[0]
662
+ else:
663
+ output = []
664
+
665
+ return self._process_model_outputs(output, scheduler_outputs)
666
+
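`step()` is meant to be driven in a loop by the caller, one decoding iteration at a time. A hedged usage sketch, assuming the engine also keeps upstream vLLM's `has_unfinished_requests()` helper (only `step()` itself appears in this diff, and `engine` is an illustrative name):

# Illustrative driver loop over an already-constructed engine with queued requests.
finished_outputs = []
while engine.has_unfinished_requests():
    for request_output in engine.step():   # one scheduling + model-execution round
        if request_output.finished:
            finished_outputs.append(request_output)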
667
+ def _log_system_stats(
668
+ self,
669
+ prompt_run: bool,
670
+ num_batched_tokens: int,
671
+ ) -> None:
672
+ now = time.monotonic()
673
+ # Log the number of batched input tokens.
674
+ if prompt_run:
675
+ self.num_prompt_tokens.append((now, num_batched_tokens))
676
+ else:
677
+ self.num_generation_tokens.append((now, num_batched_tokens))
678
+
679
+ should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
680
+ if not should_log:
681
+ return
682
+
683
+ # Discard the old stats.
684
+ self.num_prompt_tokens = [
685
+ (t, n) for t, n in self.num_prompt_tokens if now - t < _LOGGING_INTERVAL_SEC
686
+ ]
687
+ self.num_generation_tokens = [
688
+ (t, n)
689
+ for t, n in self.num_generation_tokens
690
+ if now - t < _LOGGING_INTERVAL_SEC
691
+ ]
692
+
693
+ if len(self.num_prompt_tokens) > 1:
694
+ total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1])
695
+ window = now - self.num_prompt_tokens[0][0]
696
+ avg_prompt_throughput = total_num_tokens / window
697
+ else:
698
+ avg_prompt_throughput = 0.0
699
+ if len(self.num_generation_tokens) > 1:
700
+ total_num_tokens = sum(n for _, n in self.num_generation_tokens[:-1])
701
+ window = now - self.num_generation_tokens[0][0]
702
+ avg_generation_throughput = total_num_tokens / window
703
+ else:
704
+ avg_generation_throughput = 0.0
705
+
706
+ total_num_gpu_blocks = self.cache_config.num_gpu_blocks
707
+ num_free_gpu_blocks = self.scheduler.block_manager.get_num_free_gpu_blocks()
708
+ num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
709
+ gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks
710
+
711
+ total_num_cpu_blocks = self.cache_config.num_cpu_blocks
712
+ if total_num_cpu_blocks > 0:
713
+ num_free_cpu_blocks = self.scheduler.block_manager.get_num_free_cpu_blocks()
714
+ num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
715
+ cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
716
+ else:
717
+ cpu_cache_usage = 0.0
718
+
719
+ record_metrics(
720
+ avg_prompt_throughput=avg_prompt_throughput,
721
+ avg_generation_throughput=avg_generation_throughput,
722
+ scheduler_running=len(self.scheduler.running),
723
+ scheduler_swapped=len(self.scheduler.swapped),
724
+ scheduler_waiting=len(self.scheduler.waiting),
725
+ gpu_cache_usage=gpu_cache_usage,
726
+ cpu_cache_usage=cpu_cache_usage,
727
+ )
728
+
729
+ logger.info(
730
+ "Avg prompt throughput: "
731
+ f"{avg_prompt_throughput:.1f} tokens/s, "
732
+ "Avg generation throughput: "
733
+ f"{avg_generation_throughput:.1f} tokens/s, "
734
+ f"Running: {len(self.scheduler.running)} reqs, "
735
+ f"Swapped: {len(self.scheduler.swapped)} reqs, "
736
+ f"Pending: {len(self.scheduler.waiting)} reqs, "
737
+ f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
738
+ f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%"
739
+ )
740
+ self.last_logging_time = now
741
+
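The throughput figures logged above are sliding-window averages: samples older than `_LOGGING_INTERVAL_SEC` are discarded, the newest sample is left out of the numerator, and the age of the oldest remaining sample is the denominator. The same arithmetic in isolation:

_LOGGING_INTERVAL_SEC = 5.0

def windowed_throughput(samples, now):
    # samples: list of (timestamp, num_tokens), oldest first, as kept by the engine.
    samples = [(t, n) for t, n in samples if now - t < _LOGGING_INTERVAL_SEC]
    if len(samples) <= 1:
        return 0.0
    total_tokens = sum(n for _, n in samples[:-1])
    return total_tokens / (now - samples[0][0])

print(windowed_throughput([(0.0, 128), (1.0, 128), (2.0, 128)], now=2.5))  # 102.4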
742
+ def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
743
+ """Decodes the new token for a sequence."""
744
+ (new_tokens, new_output_text, prefix_offset, read_offset) = (
745
+ detokenize_incrementally(
746
+ self.tokenizer,
747
+ all_input_ids=seq.get_token_ids(),
748
+ prev_tokens=seq.tokens,
749
+ prefix_offset=seq.prefix_offset,
750
+ read_offset=seq.read_offset,
751
+ skip_special_tokens=prms.skip_special_tokens,
752
+ spaces_between_special_tokens=prms.spaces_between_special_tokens,
753
+ )
754
+ )
755
+ if seq.tokens is None:
756
+ seq.tokens = new_tokens
757
+ else:
758
+ seq.tokens.extend(new_tokens)
759
+ seq.prefix_offset = prefix_offset
760
+ seq.read_offset = read_offset
761
+ seq.output_text += new_output_text
762
+
763
+ def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
764
+ """Stop the finished sequences."""
765
+ for stop_str in sampling_params.stop:
766
+ if seq.output_text.endswith(stop_str):
767
+ if not sampling_params.include_stop_str_in_output:
768
+ # Truncate the output text so that the stop string is
769
+ # not included in the output.
770
+ seq.output_text = seq.output_text[: -len(stop_str)]
771
+ seq.status = SequenceStatus.FINISHED_STOPPED
772
+ return
773
+ if seq.data.finished:
774
+ seq.status = SequenceStatus.FINISHED_STOPPED
775
+ return
776
+
777
+ for token_id in seq.get_last_token_id():
778
+ if token_id == sampling_params.eos_token:
779
+ seq.status = SequenceStatus.FINISHED_STOPPED
780
+ return
781
+
782
+ # Check if the sequence has reached max_model_len.
783
+ if seq.get_len() > self.scheduler_config.max_model_len:
784
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
785
+ return
786
+
787
+ # Check if the sequence has reached max_tokens.
788
+ if seq.get_output_len() == sampling_params.max_tokens:
789
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
790
+ return
791
+
792
+ # Check if the sequence has generated the EOS token.
793
+ if (not sampling_params.ignore_eos) and seq.get_last_token_id()[
794
+ 0
795
+ ] == sampling_params.eos_token:
796
+ seq.status = SequenceStatus.FINISHED_STOPPED
797
+ return
798
+
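The checks above (stop strings, the request-level finished flag, EOS, `max_model_len`, `max_tokens`) can be read as a pure function from sequence state to an optional finish reason; a simplified, hedged restatement:

from typing import List, Optional

def finish_reason(output_text: str, last_token: int, total_len: int,
                  output_len: int, stop: List[str], eos_token: int,
                  ignore_eos: bool, max_model_len: int,
                  max_tokens: int) -> Optional[str]:
    # Roughly the order used by _check_stop: stop strings, EOS, then the length caps.
    if any(output_text.endswith(s) for s in stop):
        return "stop"
    if not ignore_eos and last_token == eos_token:
        return "stop"
    if total_len > max_model_len or output_len >= max_tokens:
        return "length"
    return None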
799
+ def _run_workers(
800
+ self,
801
+ method: str,
802
+ *args,
803
+ driver_args: Optional[List[Any]] = None,
804
+ driver_kwargs: Optional[Dict[str, Any]] = None,
805
+ max_concurrent_workers: Optional[int] = None,
806
+ **kwargs,
807
+ ) -> Any:
808
+ """Runs the given method on all workers."""
809
+
810
+ if max_concurrent_workers:
811
+ raise NotImplementedError("max_concurrent_workers is not supported yet.")
812
+
813
+ # Start the ray workers first.
814
+ ray_worker_outputs = [
815
+ worker.execute_method.remote(method, *args, **kwargs)
816
+ for worker in self.workers
817
+ ]
818
+
819
+ if driver_args is None:
820
+ driver_args = args
821
+ if driver_kwargs is None:
822
+ driver_kwargs = kwargs
823
+
824
+ # Start the driver worker after all the ray workers.
825
+ driver_worker_output = getattr(self.driver_worker, method)(
826
+ *driver_args, **driver_kwargs
827
+ )
828
+
829
+ # Get the results of the ray workers.
830
+ if self.workers:
831
+ ray_worker_outputs = ray.get(ray_worker_outputs)
832
+
833
+ return [driver_worker_output] + ray_worker_outputs
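`_run_workers` dispatches the Ray workers asynchronously first, runs the same method on the in-process driver worker, and only then blocks on `ray.get`, so the driver and the remote workers overlap. The same pattern with plain Ray (a generic sketch, not this engine's worker classes):

import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def remote_work(x: int) -> int:
    return x * x

# Launch the remote tasks first so they run while the driver does its own share.
futures = [remote_work.remote(i) for i in range(4)]
driver_result = sum(i * i for i in range(4))   # driver's local work
remote_results = ray.get(futures)              # block only at the very end
print(driver_result, remote_results)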
ChatTTS/model/velocity/model_loader.py ADDED
@@ -0,0 +1,69 @@
1
+ """Utilities for selecting and loading models."""
2
+
3
+ import contextlib
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from vllm.config import ModelConfig
9
+ from vllm.model_executor.models import ModelRegistry
10
+ from vllm.model_executor.weight_utils import get_quant_config, initialize_dummy_weights
11
+
12
+ from .llama import LlamaModel
13
+
14
+
15
+ @contextlib.contextmanager
16
+ def _set_default_torch_dtype(dtype: torch.dtype):
17
+ """Sets the default torch dtype to the given dtype."""
18
+ old_dtype = torch.get_default_dtype()
19
+ torch.set_default_dtype(dtype)
20
+ yield
21
+ torch.set_default_dtype(old_dtype)
22
+
23
+
24
+ def get_model(model_config: ModelConfig) -> nn.Module:
25
+ # Get the (maybe quantized) linear method.
26
+ linear_method = None
27
+ if model_config.quantization is not None:
28
+ quant_config = get_quant_config(
29
+ model_config.quantization,
30
+ model_config.model,
31
+ model_config.hf_config,
32
+ model_config.download_dir,
33
+ )
34
+ capability = torch.cuda.get_device_capability()
35
+ capability = capability[0] * 10 + capability[1]
36
+ if capability < quant_config.get_min_capability():
37
+ raise ValueError(
38
+ f"The quantization method {model_config.quantization} is not "
39
+ "supported for the current GPU. "
40
+ f"Minimum capability: {quant_config.get_min_capability()}. "
41
+ f"Current capability: {capability}."
42
+ )
43
+ supported_dtypes = quant_config.get_supported_act_dtypes()
44
+ if model_config.dtype not in supported_dtypes:
45
+ raise ValueError(
46
+ f"{model_config.dtype} is not supported for quantization "
47
+ f"method {model_config.quantization}. Supported dtypes: "
48
+ f"{supported_dtypes}"
49
+ )
50
+ linear_method = quant_config.get_linear_method()
51
+
52
+ with _set_default_torch_dtype(model_config.dtype):
53
+ # Create a model instance.
54
+ # The weights will be initialized as empty tensors.
55
+ with torch.device("cuda"):
56
+ model = LlamaModel(model_config.hf_config, linear_method)
57
+ if model_config.load_format == "dummy":
58
+ # NOTE(woosuk): For accurate performance evaluation, we assign
59
+ # random values to the weights.
60
+ initialize_dummy_weights(model)
61
+ else:
62
+ # Load the weights from the cached or downloaded files.
63
+ model.load_weights(
64
+ model_config.model,
65
+ model_config.download_dir,
66
+ model_config.load_format,
67
+ model_config.revision,
68
+ )
69
+ return model.eval()
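`_set_default_torch_dtype` is what lets the Llama module above allocate its parameters directly in the target dtype before `load_weights` fills them in. A minimal usage sketch with a plain layer:

import torch
import torch.nn as nn

with _set_default_torch_dtype(torch.float16):
    layer = nn.Linear(8, 8)            # parameters are created as float16
print(layer.weight.dtype)              # torch.float16
print(torch.get_default_dtype())       # restored to the previous default afterwards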
ChatTTS/model/velocity/model_runner.py ADDED
@@ -0,0 +1,817 @@
1
+ import time
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .configs import ModelConfig, ParallelConfig, SchedulerConfig
9
+ from vllm.logger import init_logger
10
+ from .model_loader import get_model
11
+ from vllm.model_executor import InputMetadata, SamplingMetadata
12
+ from vllm.model_executor.parallel_utils.communication_op import (
13
+ broadcast,
14
+ broadcast_object_list,
15
+ )
16
+ from .sampling_params import SamplingParams, SamplingType
17
+ from .sequence import (
18
+ SamplerOutput,
19
+ SequenceData,
20
+ SequenceGroupMetadata,
21
+ SequenceGroupOutput,
22
+ SequenceOutput,
23
+ )
24
+ from vllm.utils import in_wsl
25
+ from ..embed import Embed
26
+ from .sampler import Sampler
27
+ from safetensors.torch import safe_open
28
+
29
+ logger = init_logger(__name__)
30
+
31
+ KVCache = Tuple[torch.Tensor, torch.Tensor]
32
+ _PAD_SLOT_ID = -1
33
+ # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
34
+ # NOTE: _get_graph_batch_size needs to be updated if this list is changed.
35
+ _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
36
+
37
+
38
+ class ModelRunner:
39
+
40
+ def __init__(
41
+ self,
42
+ model_config: ModelConfig,
43
+ parallel_config: ParallelConfig,
44
+ scheduler_config: SchedulerConfig,
45
+ is_driver_worker: bool = False,
46
+ post_model_path: str = None,
47
+ ):
48
+ self.model_config = model_config
49
+ self.parallel_config = parallel_config
50
+ self.scheduler_config = scheduler_config
51
+ self.is_driver_worker = is_driver_worker
52
+ self.post_model_path = post_model_path
53
+
54
+ # model_config can be None in tests/samplers/test_sampler.py.
55
+ # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
56
+ self.sliding_window = (
57
+ model_config.get_sliding_window() if model_config is not None else None
58
+ )
59
+ self.model = None
60
+ self.block_size = None # Set after initial profiling.
61
+
62
+ self.graph_runners: Dict[int, CUDAGraphRunner] = {}
63
+ self.graph_memory_pool = None # Set during graph capture.
64
+
65
+ self.max_context_len_to_capture = (
66
+ self.model_config.max_context_len_to_capture
67
+ if self.model_config is not None
68
+ else 0
69
+ )
70
+ # When using CUDA graph, the input block tables must be padded to
71
+ # max_context_len_to_capture. However, creating the block table in
72
+ # Python can be expensive. To optimize this, we cache the block table
73
+ # in numpy and only copy the actual input content at every iteration.
74
+ # The shape of the cached block table will be
75
+ # (max batch size to capture, max context len to capture / block size).
76
+ self.graph_block_tables = None # Set after initial profiling.
77
+ # cache in_wsl result
78
+ self.in_wsl = in_wsl()
79
+
80
+ def load_model(self) -> None:
81
+ self.model = get_model(self.model_config)
82
+ self.post_model = Embed(
83
+ self.model_config.get_hidden_size(),
84
+ self.model_config.num_audio_tokens,
85
+ self.model_config.num_text_tokens,
86
+ )
87
+ state_dict_tensors = {}
88
+ with safe_open(self.post_model_path, framework="pt", device=0) as f:
89
+ for k in f.keys():
90
+ state_dict_tensors[k] = f.get_tensor(k)
91
+ self.post_model.load_state_dict(state_dict_tensors)
92
+ self.post_model.to(next(self.model.parameters())).eval()
93
+ self.sampler = Sampler(self.post_model, self.model_config.num_audio_tokens, 4)
94
+
95
+ def set_block_size(self, block_size: int) -> None:
96
+ self.block_size = block_size
97
+
98
+ max_num_blocks = (
99
+ self.max_context_len_to_capture + block_size - 1
100
+ ) // block_size
101
+ self.graph_block_tables = np.zeros(
102
+ (max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32
103
+ )
104
+
105
+ def _prepare_prompt(
106
+ self,
107
+ seq_group_metadata_list: List[SequenceGroupMetadata],
108
+ ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int]]:
109
+ assert len(seq_group_metadata_list) > 0
110
+ input_tokens: List[List[int]] = []
111
+ input_positions: List[List[int]] = []
112
+ slot_mapping: List[List[int]] = []
113
+
114
+ prompt_lens: List[int] = []
115
+ for seq_group_metadata in seq_group_metadata_list:
116
+ assert seq_group_metadata.is_prompt
117
+ seq_ids = list(seq_group_metadata.seq_data.keys())
118
+ assert len(seq_ids) == 1
119
+ seq_id = seq_ids[0]
120
+
121
+ seq_data = seq_group_metadata.seq_data[seq_id]
122
+ prompt_tokens = seq_data.get_token_ids()
123
+ prompt_len = len(prompt_tokens)
124
+ prompt_lens.append(prompt_len)
125
+
126
+ input_tokens.append(prompt_tokens)
127
+ # NOTE(woosuk): Here we assume that the first token in the prompt
128
+ # is always the first token in the sequence.
129
+ input_positions.append(list(range(prompt_len)))
130
+
131
+ if seq_group_metadata.block_tables is None:
132
+ # During memory profiling, the block tables are not initialized
133
+ # yet. In this case, we just use a dummy slot mapping.
134
+ slot_mapping.append([_PAD_SLOT_ID] * prompt_len)
135
+ continue
136
+
137
+ # Compute the slot mapping.
138
+ slot_mapping.append([])
139
+ block_table = seq_group_metadata.block_tables[seq_id]
140
+ # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
141
+ # where start_idx is max(0, prompt_len - sliding_window).
142
+ # For example, if the prompt len is 10, sliding window is 8, and
143
+ # block size is 4, the first two tokens are masked and the slot
144
+ # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
145
+ start_idx = 0
146
+ if self.sliding_window is not None:
147
+ start_idx = max(0, prompt_len - self.sliding_window)
148
+ for i in range(prompt_len):
149
+ if i < start_idx:
150
+ slot_mapping[-1].append(_PAD_SLOT_ID)
151
+ continue
152
+
153
+ block_number = block_table[i // self.block_size]
154
+ block_offset = i % self.block_size
155
+ slot = block_number * self.block_size + block_offset
156
+ slot_mapping[-1].append(slot)
157
+
158
+ max_prompt_len = max(prompt_lens)
159
+ input_tokens = _make_tensor_with_pad(
160
+ input_tokens, max_prompt_len, pad=0, dtype=torch.long
161
+ )
162
+ input_positions = _make_tensor_with_pad(
163
+ input_positions, max_prompt_len, pad=0, dtype=torch.long
164
+ )
165
+ slot_mapping = _make_tensor_with_pad(
166
+ slot_mapping, max_prompt_len, pad=_PAD_SLOT_ID, dtype=torch.long
167
+ )
168
+
169
+ input_metadata = InputMetadata(
170
+ is_prompt=True,
171
+ slot_mapping=slot_mapping,
172
+ max_context_len=None,
173
+ context_lens=None,
174
+ block_tables=None,
175
+ use_cuda_graph=False,
176
+ )
177
+ return input_tokens, input_positions, input_metadata, prompt_lens
178
+
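The slot-mapping comment above can be reproduced in a few lines: token i of the prompt lives in slot `block_table[i // block_size] * block_size + i % block_size`, and tokens that fall outside the sliding window are padded with `_PAD_SLOT_ID`. A sketch that reproduces the documented example, assuming a cyclic block table of `[0, 1, 0]` for the 10-token prompt with window 8 and block size 4:

_PAD_SLOT_ID = -1

def prompt_slot_mapping(prompt_len, block_size, block_table, sliding_window=None):
    start_idx = 0 if sliding_window is None else max(0, prompt_len - sliding_window)
    slots = []
    for i in range(prompt_len):
        if i < start_idx:
            slots.append(_PAD_SLOT_ID)   # token already evicted by the sliding window
        else:
            block_number = block_table[i // block_size]
            slots.append(block_number * block_size + i % block_size)
    return slots

print(prompt_slot_mapping(10, 4, [0, 1, 0], sliding_window=8))
# -> [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]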
179
+ def _prepare_decode(
180
+ self,
181
+ seq_group_metadata_list: List[SequenceGroupMetadata],
182
+ ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]:
183
+ assert len(seq_group_metadata_list) > 0
184
+ input_tokens: List[List[int]] = []
185
+ input_positions: List[List[int]] = []
186
+ slot_mapping: List[List[int]] = []
187
+ context_lens: List[int] = []
188
+ block_tables: List[List[int]] = []
189
+
190
+ for seq_group_metadata in seq_group_metadata_list:
191
+ assert not seq_group_metadata.is_prompt
192
+
193
+ seq_ids = list(seq_group_metadata.seq_data.keys())
194
+ for seq_id in seq_ids:
195
+ seq_data = seq_group_metadata.seq_data[seq_id]
196
+ generation_token = seq_data.get_last_token_id()
197
+ input_tokens.append([generation_token])
198
+
199
+ seq_len = seq_data.get_len()
200
+ position = seq_len - 1
201
+ input_positions.append([position])
202
+
203
+ context_len = (
204
+ seq_len
205
+ if self.sliding_window is None
206
+ else min(seq_len, self.sliding_window)
207
+ )
208
+ context_lens.append(context_len)
209
+
210
+ block_table = seq_group_metadata.block_tables[seq_id]
211
+ block_number = block_table[position // self.block_size]
212
+ block_offset = position % self.block_size
213
+ slot = block_number * self.block_size + block_offset
214
+ slot_mapping.append([slot])
215
+
216
+ if self.sliding_window is not None:
217
+ sliding_window_blocks = self.sliding_window // self.block_size
218
+ block_table = block_table[-sliding_window_blocks:]
219
+ block_tables.append(block_table)
220
+
221
+ batch_size = len(input_tokens)
222
+ max_context_len = max(context_lens)
223
+ use_captured_graph = (
224
+ not self.model_config.enforce_eager
225
+ and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
226
+ and max_context_len <= self.max_context_len_to_capture
227
+ )
228
+ if use_captured_graph:
229
+ # Pad the input tokens, positions, and slot mapping to match the
230
+ # batch size of the captured graph.
231
+ graph_batch_size = _get_graph_batch_size(batch_size)
232
+ assert graph_batch_size >= batch_size
233
+ for _ in range(graph_batch_size - batch_size):
234
+ input_tokens.append([])
235
+ input_positions.append([])
236
+ slot_mapping.append([])
237
+ context_lens.append(1)
238
+ block_tables.append([])
239
+ batch_size = graph_batch_size
240
+
241
+ input_tokens = _make_tensor_with_pad(
242
+ input_tokens, max_len=1, pad=0, dtype=torch.long, device="cuda"
243
+ )
244
+ input_positions = _make_tensor_with_pad(
245
+ input_positions, max_len=1, pad=0, dtype=torch.long, device="cuda"
246
+ )
247
+ slot_mapping = _make_tensor_with_pad(
248
+ slot_mapping, max_len=1, pad=_PAD_SLOT_ID, dtype=torch.long, device="cuda"
249
+ )
250
+ context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
251
+
252
+ if use_captured_graph:
253
+ # The shape of graph_block_tables is
254
+ # [max batch size, max context len // block size].
255
+ input_block_tables = self.graph_block_tables[:batch_size]
256
+ for i, block_table in enumerate(block_tables):
257
+ if block_table:
258
+ input_block_tables[i, : len(block_table)] = block_table
259
+ block_tables = torch.tensor(input_block_tables, device="cuda")
260
+ else:
261
+ block_tables = _make_tensor_with_pad(
262
+ block_tables,
263
+ max_len=max_context_len,
264
+ pad=0,
265
+ dtype=torch.int,
266
+ device="cuda",
267
+ )
268
+
269
+ input_metadata = InputMetadata(
270
+ is_prompt=False,
271
+ slot_mapping=slot_mapping,
272
+ max_context_len=max_context_len,
273
+ context_lens=context_lens,
274
+ block_tables=block_tables,
275
+ use_cuda_graph=use_captured_graph,
276
+ )
277
+ return input_tokens, input_positions, input_metadata
278
+
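For decode steps that will run under a captured CUDA graph, the batch is padded up to the nearest captured size so it matches one of the graphs built in `capture_model` (see `_BATCH_SIZES_TO_CAPTURE` and `_get_graph_batch_size` in this file). The rounding rule in isolation:

def graph_batch_size(batch_size: int) -> int:
    # 1 and 2 are captured exactly, 3-4 round up to 4, larger sizes to a multiple of 8.
    if batch_size <= 2:
        return batch_size
    if batch_size <= 4:
        return 4
    return (batch_size + 7) // 8 * 8

print([graph_batch_size(b) for b in (1, 2, 3, 5, 9, 17)])   # [1, 2, 4, 8, 16, 24]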
279
+ def _prepare_sample(
280
+ self,
281
+ seq_group_metadata_list: List[SequenceGroupMetadata],
282
+ prompt_lens: List[int],
283
+ ) -> SamplingMetadata:
284
+ seq_groups: List[Tuple[List[int], SamplingParams]] = []
285
+ selected_token_indices: List[int] = []
286
+ selected_token_start_idx = 0
287
+ categorized_sample_indices = {t: [] for t in SamplingType}
288
+ categorized_sample_indices_start_idx = 0
289
+
290
+ max_prompt_len = max(prompt_lens) if prompt_lens else 1
291
+ for i, seq_group_metadata in enumerate(seq_group_metadata_list):
292
+ seq_ids = list(seq_group_metadata.seq_data.keys())
293
+ sampling_params = seq_group_metadata.sampling_params
294
+ seq_groups.append((seq_ids, sampling_params))
295
+
296
+ if seq_group_metadata.is_prompt:
297
+ assert len(seq_ids) == 1
298
+ prompt_len = prompt_lens[i]
299
+ if sampling_params.prompt_logprobs is not None:
300
+ # NOTE: prompt token positions do not need to be sampled, so skip them
301
+ categorized_sample_indices_start_idx += prompt_len - 1
302
+
303
+ categorized_sample_indices[sampling_params.sampling_type].append(
304
+ categorized_sample_indices_start_idx
305
+ )
306
+ categorized_sample_indices_start_idx += 1
307
+
308
+ if sampling_params.prompt_logprobs is not None:
309
+ selected_token_indices.extend(
310
+ range(
311
+ selected_token_start_idx,
312
+ selected_token_start_idx + prompt_len - 1,
313
+ )
314
+ )
315
+ selected_token_indices.append(selected_token_start_idx + prompt_len - 1)
316
+ selected_token_start_idx += max_prompt_len
317
+ else:
318
+ num_seqs = len(seq_ids)
319
+ selected_token_indices.extend(
320
+ range(selected_token_start_idx, selected_token_start_idx + num_seqs)
321
+ )
322
+ selected_token_start_idx += num_seqs
323
+
324
+ categorized_sample_indices[sampling_params.sampling_type].extend(
325
+ range(
326
+ categorized_sample_indices_start_idx,
327
+ categorized_sample_indices_start_idx + num_seqs,
328
+ )
329
+ )
330
+ categorized_sample_indices_start_idx += num_seqs
331
+
332
+ selected_token_indices = _async_h2d(
333
+ selected_token_indices, dtype=torch.long, pin_memory=not self.in_wsl
334
+ )
335
+ categorized_sample_indices = {
336
+ t: _async_h2d(seq_ids, dtype=torch.int, pin_memory=not self.in_wsl)
337
+ for t, seq_ids in categorized_sample_indices.items()
338
+ }
339
+
340
+ seq_data: Dict[int, SequenceData] = {}
341
+ for seq_group_metadata in seq_group_metadata_list:
342
+ seq_data.update(seq_group_metadata.seq_data)
343
+
344
+ sampling_metadata = SamplingMetadata(
345
+ seq_groups=seq_groups,
346
+ seq_data=seq_data,
347
+ prompt_lens=prompt_lens,
348
+ selected_token_indices=selected_token_indices,
349
+ categorized_sample_indices=categorized_sample_indices,
350
+ )
351
+ return sampling_metadata
352
+
353
+ def prepare_input_tensors(
354
+ self,
355
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
356
+ ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata]:
357
+ if self.is_driver_worker:
358
+ # NOTE: We assume that all sequences in the group are all prompts or
359
+ # all decodes.
360
+ is_prompt = seq_group_metadata_list[0].is_prompt
361
+ # Prepare input tensors.
362
+ if is_prompt:
363
+ (input_tokens, input_positions, input_metadata, prompt_lens) = (
364
+ self._prepare_prompt(seq_group_metadata_list)
365
+ )
366
+ else:
367
+ (input_tokens, input_positions, input_metadata) = self._prepare_decode(
368
+ seq_group_metadata_list
369
+ )
370
+ prompt_lens = []
371
+ sampling_metadata = self._prepare_sample(
372
+ seq_group_metadata_list, prompt_lens
373
+ )
374
+
375
+ def get_size_or_none(x: Optional[torch.Tensor]):
376
+ return x.size() if x is not None else None
377
+
378
+ # Broadcast the input data. For input tensors, we first broadcast
379
+ # its shape and then broadcast the tensor to avoid high
380
+ # serialization cost.
381
+ py_data = {
382
+ "input_tokens_size": input_tokens.size(),
383
+ "input_positions_size": input_positions.size(),
384
+ "is_prompt": input_metadata.is_prompt,
385
+ "slot_mapping_size": get_size_or_none(input_metadata.slot_mapping),
386
+ "max_context_len": input_metadata.max_context_len,
387
+ "context_lens_size": get_size_or_none(input_metadata.context_lens),
388
+ "block_tables_size": get_size_or_none(input_metadata.block_tables),
389
+ "use_cuda_graph": input_metadata.use_cuda_graph,
390
+ "selected_token_indices_size": sampling_metadata.selected_token_indices.size(),
391
+ }
392
+ broadcast_object_list([py_data], src=0)
393
+ # TODO(zhuohan): Combine the broadcasts or set async_op=True.
394
+ broadcast(input_tokens, src=0)
395
+ broadcast(input_positions, src=0)
396
+ if input_metadata.slot_mapping is not None:
397
+ broadcast(input_metadata.slot_mapping, src=0)
398
+ if input_metadata.context_lens is not None:
399
+ broadcast(input_metadata.context_lens, src=0)
400
+ if input_metadata.block_tables is not None:
401
+ broadcast(input_metadata.block_tables, src=0)
402
+ broadcast(sampling_metadata.selected_token_indices, src=0)
403
+ else:
404
+ receiving_list = [None]
405
+ broadcast_object_list(receiving_list, src=0)
406
+ py_data = receiving_list[0]
407
+ input_tokens = torch.empty(
408
+ *py_data["input_tokens_size"], dtype=torch.long, device="cuda"
409
+ )
410
+ broadcast(input_tokens, src=0)
411
+ input_positions = torch.empty(
412
+ *py_data["input_positions_size"], dtype=torch.long, device="cuda"
413
+ )
414
+ broadcast(input_positions, src=0)
415
+ if py_data["slot_mapping_size"] is not None:
416
+ slot_mapping = torch.empty(
417
+ *py_data["slot_mapping_size"], dtype=torch.long, device="cuda"
418
+ )
419
+ broadcast(slot_mapping, src=0)
420
+ else:
421
+ slot_mapping = None
422
+ if py_data["context_lens_size"] is not None:
423
+ context_lens = torch.empty(
424
+ *py_data["context_lens_size"], dtype=torch.int, device="cuda"
425
+ )
426
+ broadcast(context_lens, src=0)
427
+ else:
428
+ context_lens = None
429
+ if py_data["block_tables_size"] is not None:
430
+ block_tables = torch.empty(
431
+ *py_data["block_tables_size"], dtype=torch.int, device="cuda"
432
+ )
433
+ broadcast(block_tables, src=0)
434
+ else:
435
+ block_tables = None
436
+ selected_token_indices = torch.empty(
437
+ *py_data["selected_token_indices_size"], dtype=torch.long, device="cuda"
438
+ )
439
+ broadcast(selected_token_indices, src=0)
440
+ input_metadata = InputMetadata(
441
+ is_prompt=py_data["is_prompt"],
442
+ slot_mapping=slot_mapping,
443
+ max_context_len=py_data["max_context_len"],
444
+ context_lens=context_lens,
445
+ block_tables=block_tables,
446
+ use_cuda_graph=py_data["use_cuda_graph"],
447
+ )
448
+ sampling_metadata = SamplingMetadata(
449
+ seq_groups=None,
450
+ seq_data=None,
451
+ prompt_lens=None,
452
+ selected_token_indices=selected_token_indices,
453
+ categorized_sample_indices=None,
454
+ perform_sampling=False,
455
+ )
456
+
457
+ return input_tokens, input_positions, input_metadata, sampling_metadata
458
+
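`prepare_input_tensors` keeps tensor traffic cheap by broadcasting a small dict of shapes first (`broadcast_object_list`) and then broadcasting each tensor into buffers that the non-driver ranks allocate from those shapes. A generic `torch.distributed` sketch of that two-phase pattern (the engine itself goes through vLLM's `communication_op` wrappers; this assumes an already-initialized process group):

import torch
import torch.distributed as dist

def broadcast_with_shape(tensor_or_none, src=0, device="cuda"):
    # Driver rank passes the real tensor; other ranks pass None and receive a copy.
    if dist.get_rank() == src:
        meta = [{"shape": tuple(tensor_or_none.shape), "dtype": tensor_or_none.dtype}]
        dist.broadcast_object_list(meta, src=src)
        dist.broadcast(tensor_or_none, src=src)
        return tensor_or_none
    meta = [None]
    dist.broadcast_object_list(meta, src=src)
    buf = torch.empty(meta[0]["shape"], dtype=meta[0]["dtype"], device=device)
    dist.broadcast(buf, src=src)
    return buf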
459
+ @torch.inference_mode()
460
+ def execute_model(
461
+ self,
462
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
463
+ kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
464
+ ) -> Optional[SamplerOutput]:
465
+ input_tokens, input_positions, input_metadata, sampling_metadata = (
466
+ self.prepare_input_tensors(seq_group_metadata_list)
467
+ )
468
+ # print(sampling_metadata.seq_data)
469
+ seq_groups = []
470
+ input_tokens_history = []
471
+ for i, rtn in enumerate(sampling_metadata.seq_groups):
472
+ seq_groups.append(rtn[0][0])
473
+ tokens_history = sampling_metadata.seq_data[rtn[0][0]].output_token_ids
474
+ if len(tokens_history) >= 1:
475
+ if len(tokens_history[0]) == 1:
476
+ tokens_history = [token[0] for token in tokens_history]
477
+ else:
478
+ tokens_history = [list(token) for token in tokens_history]
479
+ input_tokens_history.append(tokens_history)
480
+ input_tokens_history = torch.tensor(input_tokens_history).to(
481
+ input_tokens.device
482
+ )
483
+ # token_ids = rtn.outputs[0].token_ids
484
+ # for j, token_id in enumerate(token_ids):
485
+ # if len(token_id) == 1:
486
+ # token_ids[j] = token_id[0]
487
+ # else:
488
+ # token_ids[j] = list(token_id)
489
+
490
+ # Execute the model.
491
+ # print("it1",input_tokens)
492
+ if len(input_tokens.shape) == 2:
493
+ input_tokens = input_tokens.unsqueeze(2).repeat(1, 1, 4)
494
+ if len(input_tokens_history.shape) == 2:
495
+ input_tokens_history = input_tokens_history.unsqueeze(2).repeat(1, 1, 4)
496
+ # print(input_tokens_history.shape)
497
+ # print("it2",input_tokens.shape)
498
+ text_mask = input_tokens != 0
499
+ text_mask = text_mask[:, :, 0]
500
+
501
+ if input_metadata.use_cuda_graph:
502
+ graph_batch_size = input_tokens.shape[0]
503
+ model_executable = self.graph_runners[graph_batch_size]
504
+ else:
505
+ model_executable = self.model
506
+
507
+ infer_text = sampling_metadata.seq_groups[0][1].infer_text
508
+ temperature = sampling_metadata.seq_groups[0][1].temperature
509
+ if not infer_text:
510
+ temperature = torch.tensor(temperature).to(input_tokens.device)
511
+ logits_processors, logits_warpers = sampling_metadata.seq_groups[0][
512
+ 1
513
+ ].logits_processors
514
+ # print(logits_processors, logits_warpers)
515
+ min_new_token = sampling_metadata.seq_groups[0][1].min_new_token
516
+ eos_token = sampling_metadata.seq_groups[0][1].eos_token
517
+ start_idx = sampling_metadata.seq_groups[0][1].start_idx
518
+ if input_tokens.shape[-2] == 1:
519
+ if infer_text:
520
+ input_emb: torch.Tensor = self.post_model.emb_text(
521
+ input_tokens[:, :, 0]
522
+ )
523
+ else:
524
+ code_emb = [
525
+ self.post_model.emb_code[i](input_tokens[:, :, i])
526
+ for i in range(self.post_model.num_vq)
527
+ ]
528
+ input_emb = torch.stack(code_emb, 3).sum(3)
529
+ start_idx = (
530
+ input_tokens_history.shape[-2] - 1
531
+ if input_tokens_history.shape[-2] > 0
532
+ else 0
533
+ )
534
+ else:
535
+ input_emb = self.post_model(input_tokens, text_mask)
536
+ # print(input_emb.shape)
537
+ hidden_states = model_executable(
538
+ input_emb=input_emb,
539
+ positions=input_positions,
540
+ kv_caches=kv_caches,
541
+ input_metadata=input_metadata,
542
+ )
543
+ # print(hidden_states.shape)
544
+ # print(input_tokens)
545
+ B_NO_PAD = input_tokens_history.shape[0]
546
+ input_tokens = input_tokens[:B_NO_PAD, :, :]
547
+ hidden_states = hidden_states[:B_NO_PAD, :, :]
548
+ idx_next, logprob, finish = self.sampler.sample(
549
+ inputs_ids=(
550
+ input_tokens
551
+ if input_tokens_history.shape[-2] == 0
552
+ else input_tokens_history
553
+ ),
554
+ hidden_states=hidden_states,
555
+ infer_text=infer_text,
556
+ temperature=temperature,
557
+ logits_processors=logits_processors,
558
+ logits_warpers=logits_warpers,
559
+ min_new_token=min_new_token,
560
+ now_length=1,
561
+ eos_token=eos_token,
562
+ start_idx=start_idx,
563
+ )
564
+ # print(logprob.shape, idx_next.shape)
565
+ if len(logprob.shape) == 2:
566
+ logprob = logprob[:, None, :]
567
+ logprob = torch.gather(logprob, -1, idx_next.transpose(-1, -2))[:, :, 0]
568
+ # print("debug", idx_next.shape, logprob.shape)
569
+ # Sample the next token.
570
+ # output = self.model.sample(
571
+ # hidden_states=hidden_states,
572
+ # sampling_metadata=sampling_metadata,
573
+ # )
574
+ results = []
575
+ for i in range(idx_next.shape[0]):
576
+ idx_next_i = idx_next[i, 0, :].tolist()
577
+ logprob_i = logprob[i].tolist()
578
+ tmp_hidden_states = hidden_states[i]
579
+ if input_tokens[i].shape[-2] != 1:
580
+ tmp_hidden_states = tmp_hidden_states[-1:, :]
581
+ result = SequenceGroupOutput(
582
+ samples=[
583
+ SequenceOutput(
584
+ parent_seq_id=seq_groups[i],
585
+ logprobs={tuple(idx_next_i): logprob_i},
586
+ output_token=tuple(idx_next_i),
587
+ hidden_states=tmp_hidden_states,
588
+ finished=finish[i].item(),
589
+ ),
590
+ ],
591
+ prompt_logprobs=None,
592
+ )
593
+ results.append(result)
594
+ # print(results)
595
+ # print(idx_next, idx_next.shape, logprob.shape)
596
+ return results
597
+
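The distinctive part of `execute_model` is how the input embedding is built before the transformer runs: in text mode the first channel of the `(B, T, num_vq)` token tensor goes through `emb_text`, while in audio mode the per-codebook embeddings are summed. A toy sketch of that combination (illustrative dimensions; the real layers live in ChatTTS/model/embed.py):

import torch
import torch.nn as nn

class ToyEmbed(nn.Module):
    # Toy stand-in for the Embed post-model: one text table plus num_vq code tables.
    def __init__(self, hidden=16, num_text=100, num_audio=50, num_vq=4):
        super().__init__()
        self.num_vq = num_vq
        self.emb_text = nn.Embedding(num_text, hidden)
        self.emb_code = nn.ModuleList(nn.Embedding(num_audio, hidden) for _ in range(num_vq))

    def forward(self, ids: torch.Tensor, infer_text: bool) -> torch.Tensor:
        if infer_text:
            return self.emb_text(ids[:, :, 0])          # text mode: channel 0 only
        code = [self.emb_code[i](ids[:, :, i]) for i in range(self.num_vq)]
        return torch.stack(code, 3).sum(3)              # audio mode: sum the codebooks

toy = ToyEmbed()
ids = torch.zeros(2, 3, 4, dtype=torch.long)
print(toy(ids, True).shape, toy(ids, False).shape)      # both torch.Size([2, 3, 16])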
598
+ @torch.inference_mode()
599
+ def profile_run(self) -> None:
600
+ # Enable top-k sampling to reflect the accurate memory usage.
601
+ vocab_size = self.model_config.get_vocab_size()
602
+ sampling_params = SamplingParams(
603
+ top_p=0.99, top_k=vocab_size - 1, infer_text=True
604
+ )
605
+ max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
606
+ max_num_seqs = self.scheduler_config.max_num_seqs
607
+
608
+ # Profile memory usage with max_num_sequences sequences and the total
609
+ # number of tokens equal to max_num_batched_tokens.
610
+ seqs: List[SequenceGroupMetadata] = []
611
+ for group_id in range(max_num_seqs):
612
+ seq_len = max_num_batched_tokens // max_num_seqs + (
613
+ group_id < max_num_batched_tokens % max_num_seqs
614
+ )
615
+ seq_data = SequenceData([0] * seq_len)
616
+ seq = SequenceGroupMetadata(
617
+ request_id=str(group_id),
618
+ is_prompt=True,
619
+ seq_data={group_id: seq_data},
620
+ sampling_params=sampling_params,
621
+ block_tables=None,
622
+ )
623
+ seqs.append(seq)
624
+
625
+ # Run the model with the dummy inputs.
626
+ num_layers = self.model_config.get_num_layers(self.parallel_config)
627
+ kv_caches = [(None, None)] * num_layers
628
+ self.execute_model(seqs, kv_caches)
629
+ torch.cuda.synchronize()
630
+ return
631
+
632
+ @torch.inference_mode()
633
+ def capture_model(self, kv_caches: List[KVCache]) -> None:
634
+ assert not self.model_config.enforce_eager
635
+ logger.info(
636
+ "Capturing the model for CUDA graphs. This may lead to "
637
+ "unexpected consequences if the model is not static. To "
638
+ "run the model in eager mode, set 'enforce_eager=True' or "
639
+ "use '--enforce-eager' in the CLI."
640
+ )
641
+ logger.info(
642
+ "CUDA graphs can take additional 1~3 GiB memory per GPU. "
643
+ "If you are running out of memory, consider decreasing "
644
+ "`gpu_memory_utilization` or enforcing eager mode."
645
+ )
646
+ start_time = time.perf_counter()
647
+
648
+ # Prepare dummy inputs. These will be reused for all batch sizes.
649
+ max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
650
+ input_emb = torch.zeros(
651
+ max_batch_size,
652
+ 1,
653
+ self.model_config.get_hidden_size(),
654
+ dtype=next(self.model.parameters()).dtype,
655
+ ).cuda()
656
+ input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda()
657
+ slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda()
658
+ slot_mapping.fill_(_PAD_SLOT_ID)
659
+ context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
660
+ block_tables = torch.from_numpy(self.graph_block_tables).cuda()
661
+
662
+ # NOTE: Capturing the largest batch size first may help reduce the
663
+ # memory usage of CUDA graph.
664
+ for batch_size in reversed(_BATCH_SIZES_TO_CAPTURE):
665
+ # Create dummy input_metadata.
666
+ input_metadata = InputMetadata(
667
+ is_prompt=False,
668
+ slot_mapping=slot_mapping[:batch_size],
669
+ max_context_len=self.max_context_len_to_capture,
670
+ context_lens=context_lens[:batch_size],
671
+ block_tables=block_tables[:batch_size],
672
+ use_cuda_graph=True,
673
+ )
674
+
675
+ graph_runner = CUDAGraphRunner(self.model)
676
+ graph_runner.capture(
677
+ input_emb[:batch_size],
678
+ input_positions[:batch_size],
679
+ kv_caches,
680
+ input_metadata,
681
+ memory_pool=self.graph_memory_pool,
682
+ )
683
+ self.graph_memory_pool = graph_runner.graph.pool()
684
+ self.graph_runners[batch_size] = graph_runner
685
+
686
+ end_time = time.perf_counter()
687
+ elapsed_time = end_time - start_time
688
+ # This usually takes < 10 seconds.
689
+ logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
690
+
691
+
692
+ class CUDAGraphRunner:
693
+
694
+ def __init__(self, model: nn.Module):
695
+ self.model = model
696
+ self.graph = None
697
+ self.input_buffers: Dict[str, torch.Tensor] = {}
698
+ self.output_buffers: Dict[str, torch.Tensor] = {}
699
+
700
+ def capture(
701
+ self,
702
+ input_emb: torch.Tensor,
703
+ positions: torch.Tensor,
704
+ kv_caches: List[KVCache],
705
+ input_metadata: InputMetadata,
706
+ memory_pool,
707
+ ) -> None:
708
+ assert self.graph is None
709
+ # Run the model once without capturing the graph.
710
+ # This is to make sure that the captured graph does not include the
711
+ # kernel launches for initial benchmarking (e.g., Triton autotune).
712
+ self.model(
713
+ input_emb,
714
+ positions,
715
+ kv_caches,
716
+ input_metadata,
717
+ )
718
+ torch.cuda.synchronize()
719
+
720
+ # Capture the graph.
721
+ self.graph = torch.cuda.CUDAGraph()
722
+ with torch.cuda.graph(self.graph, pool=memory_pool):
723
+ hidden_states = self.model(
724
+ input_emb,
725
+ positions,
726
+ kv_caches,
727
+ input_metadata,
728
+ )
729
+ torch.cuda.synchronize()
730
+
731
+ # Save the input and output buffers.
732
+ self.input_buffers = {
733
+ "input_emb": input_emb,
734
+ "positions": positions,
735
+ "kv_caches": kv_caches,
736
+ "slot_mapping": input_metadata.slot_mapping,
737
+ "context_lens": input_metadata.context_lens,
738
+ "block_tables": input_metadata.block_tables,
739
+ }
740
+ self.output_buffers = {"hidden_states": hidden_states}
741
+ return
742
+
743
+ def forward(
744
+ self,
745
+ input_emb: torch.Tensor,
746
+ positions: torch.Tensor,
747
+ kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
748
+ input_metadata: InputMetadata,
749
+ ) -> torch.Tensor:
750
+ # KV caches are fixed tensors, so we don't need to copy them.
751
+ del kv_caches
752
+
753
+ # Copy the input tensors to the input buffers.
754
+ self.input_buffers["input_emb"].copy_(input_emb, non_blocking=True)
755
+ self.input_buffers["positions"].copy_(positions, non_blocking=True)
756
+ self.input_buffers["slot_mapping"].copy_(
757
+ input_metadata.slot_mapping, non_blocking=True
758
+ )
759
+ self.input_buffers["context_lens"].copy_(
760
+ input_metadata.context_lens, non_blocking=True
761
+ )
762
+ self.input_buffers["block_tables"].copy_(
763
+ input_metadata.block_tables, non_blocking=True
764
+ )
765
+
766
+ # Run the graph.
767
+ self.graph.replay()
768
+
769
+ # Return the output tensor.
770
+ return self.output_buffers["hidden_states"]
771
+
772
+ def __call__(self, *args, **kwargs):
773
+ return self.forward(*args, **kwargs)
774
+
775
+
776
+ def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
777
+ assert len(x) <= max_len
778
+ if len(x) == max_len:
779
+ return list(x)
780
+ return list(x) + [pad] * (max_len - len(x))
781
+
782
+
783
+ def _make_tensor_with_pad(
784
+ x: List[List[int]],
785
+ max_len: int,
786
+ pad: int,
787
+ dtype: torch.dtype,
788
+ device: Union[str, torch.device] = "cuda",
789
+ pin_memory: bool = False,
790
+ ) -> torch.Tensor:
791
+ padded_x = []
792
+ for x_i in x:
793
+ pad_i = pad
794
+ if isinstance(x[0][0], tuple):
795
+ pad_i = (0,) * len(x[0][0])
796
+ padded_x.append(_pad_to_max(x_i, max_len, pad_i))
797
+
798
+ return torch.tensor(
799
+ padded_x,
800
+ dtype=dtype,
801
+ device=device,
802
+ pin_memory=pin_memory and str(device) == "cpu",
803
+ )
804
+
805
+
806
+ def _get_graph_batch_size(batch_size: int) -> int:
807
+ if batch_size <= 2:
808
+ return batch_size
809
+ elif batch_size <= 4:
810
+ return 4
811
+ else:
812
+ return (batch_size + 7) // 8 * 8
813
+
814
+
815
+ def _async_h2d(data: list, dtype, pin_memory):
816
+ t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory)
817
+ return t.to(device="cuda", non_blocking=True)
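The helpers at the bottom of the file are pure functions and easy to sanity-check in isolation (using device="cpu" here so no GPU is needed):

import torch

print(_pad_to_max([3, 1, 4], max_len=5, pad=0))          # [3, 1, 4, 0, 0]
print(_make_tensor_with_pad([[1, 2], [3]], max_len=4, pad=0,
                            dtype=torch.long, device="cpu"))
# tensor([[1, 2, 0, 0],
#         [3, 0, 0, 0]])
print(_get_graph_batch_size(9))                          # 16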
ChatTTS/model/velocity/output.py ADDED
@@ -0,0 +1,144 @@
1
+ from typing import List, Optional
2
+ import torch
3
+
4
+ from .sequence import (
5
+ PromptLogprobs,
6
+ SampleLogprobs,
7
+ SequenceGroup,
8
+ SequenceStatus,
9
+ )
10
+
11
+
12
+ class CompletionOutput:
13
+ """The output data of one completion output of a request.
14
+
15
+ Args:
16
+ index: The index of the output in the request.
17
+ text: The generated output text.
18
+ token_ids: The token IDs of the generated output text.
19
+ cumulative_logprob: The cumulative log probability of the generated
20
+ output text.
21
+ logprobs: The log probabilities of the top probability words at each
22
+ position if the logprobs are requested.
23
+ finish_reason: The reason why the sequence is finished.
24
+ """
25
+
26
+ def __init__(
27
+ self,
28
+ index: int,
29
+ text: str,
30
+ token_ids: List[int],
31
+ cumulative_logprob: float,
32
+ logprobs: Optional[SampleLogprobs],
33
+ finish_reason: Optional[str] = None,
34
+ hidden_states: Optional[torch.Tensor] = None,
35
+ ) -> None:
36
+ self.index = index
37
+ self.text = text
38
+ self.token_ids = token_ids
39
+ self.cumulative_logprob = cumulative_logprob
40
+ self.logprobs = logprobs
41
+ self.finish_reason = finish_reason
42
+ self.hidden_states = hidden_states
43
+
44
+ def finished(self) -> bool:
45
+ return self.finish_reason is not None
46
+
47
+ def __repr__(self) -> str:
48
+ return (
49
+ f"CompletionOutput(index={self.index}, "
50
+ f"text={self.text!r}, "
51
+ f"token_ids={self.token_ids}, "
52
+ f"cumulative_logprob={self.cumulative_logprob}, "
53
+ f"logprobs={self.logprobs}, "
54
+ f"finish_reason={self.finish_reason}, "
55
+ f"hidden_states={self.hidden_states.shape if self.hidden_states is not None else None})"
56
+ )
57
+
58
+
59
+ class RequestOutput:
60
+ """The output data of a request to the LLM.
61
+
62
+ Args:
63
+ request_id: The unique ID of the request.
64
+ prompt: The prompt string of the request.
65
+ prompt_token_ids: The token IDs of the prompt.
66
+ prompt_logprobs: The log probabilities to return per prompt token.
67
+ outputs: The output sequences of the request.
68
+ finished: Whether the whole request is finished.
69
+ """
70
+
71
+ def __init__(
72
+ self,
73
+ request_id: str,
74
+ prompt: str,
75
+ prompt_token_ids: List[int],
76
+ prompt_logprobs: Optional[PromptLogprobs],
77
+ outputs: List[CompletionOutput],
78
+ finished: bool,
79
+ ) -> None:
80
+ self.request_id = request_id
81
+ self.prompt = prompt
82
+ self.prompt_token_ids = prompt_token_ids
83
+ self.prompt_logprobs = prompt_logprobs
84
+ self.outputs = outputs
85
+ self.finished = finished
86
+
87
+ @classmethod
88
+ def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
89
+ # Get the top-n sequences.
90
+ n = seq_group.sampling_params.n
91
+ seqs = seq_group.get_seqs()
92
+ if seq_group.sampling_params.use_beam_search:
93
+ sorting_key = lambda seq: seq.get_beam_search_score(
94
+ seq_group.sampling_params.length_penalty
95
+ )
96
+ else:
97
+ sorting_key = lambda seq: seq.get_cumulative_logprob()
98
+ sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
99
+ top_n_seqs = sorted_seqs[:n]
100
+
101
+ # Create the outputs.
102
+ outputs: List[CompletionOutput] = []
103
+ for seq in top_n_seqs:
104
+ logprobs = seq.output_logprobs
105
+ if seq_group.sampling_params.logprobs is None:
106
+ # NOTE: We need to take care of this case because the sequence
107
+ # always has the logprobs of the sampled tokens even if the
108
+ # logprobs are not requested.
109
+ logprobs = None
110
+ finished_reason = SequenceStatus.get_finished_reason(seq.status)
111
+ output = CompletionOutput(
112
+ seqs.index(seq),
113
+ seq.output_text,
114
+ seq.get_output_token_ids(),
115
+ seq.get_cumulative_logprob(),
116
+ logprobs,
117
+ finished_reason,
118
+ seq.data.hidden_states,
119
+ )
120
+ outputs.append(output)
121
+
122
+ # Every sequence in the sequence group should have the same prompt.
123
+ prompt = seq_group.prompt
124
+ prompt_token_ids = seq_group.prompt_token_ids
125
+ prompt_logprobs = seq_group.prompt_logprobs
126
+ finished = seq_group.is_finished()
127
+ return cls(
128
+ seq_group.request_id,
129
+ prompt,
130
+ prompt_token_ids,
131
+ prompt_logprobs,
132
+ outputs,
133
+ finished,
134
+ )
135
+
136
+ def __repr__(self) -> str:
137
+ return (
138
+ f"RequestOutput(request_id={self.request_id}, "
139
+ f"prompt={self.prompt!r}, "
140
+ f"prompt_token_ids={self.prompt_token_ids}, "
141
+ f"prompt_logprobs={self.prompt_logprobs}, "
142
+ f"outputs={self.outputs}, "
143
+ f"finished={self.finished})"
144
+ )
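On the consumer side these classes are plain containers: every `CompletionOutput` carries the sampled token IDs, the cumulative logprob, optional per-step hidden states, and a `finish_reason` once the sequence is done. A hedged sketch of how a caller might summarize one result:

def summarize(request_output: "RequestOutput") -> list:
    rows = []
    for completion in request_output.outputs:
        rows.append({
            "index": completion.index,
            "num_tokens": len(completion.token_ids),
            "cumulative_logprob": completion.cumulative_logprob,
            "finish_reason": completion.finish_reason,   # None until the sequence finishes
        })
    return rows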
ChatTTS/model/velocity/sampler.py ADDED
@@ -0,0 +1,120 @@
1
+ import torch
2
+ from torch.functional import F
3
+ from typing import List, Callable
4
+
5
+ from ..embed import Embed
6
+
7
+
8
+ class Sampler:
9
+ def __init__(self, post_model: Embed, num_audio_tokens: int, num_vq: int):
10
+ self.post_model = post_model
11
+ self.device = next(self.post_model.parameters()).device
12
+ self.num_audio_tokens = num_audio_tokens
13
+ self.num_vq = num_vq
14
+
15
+ def sample(
16
+ self,
17
+ inputs_ids: torch.Tensor,
18
+ hidden_states: torch.Tensor,
19
+ infer_text: bool = False,
20
+ temperature: torch.Tensor = 1.0,
21
+ logits_processors: List[Callable] = [
22
+ lambda logits_token, logits: logits,
23
+ ],
24
+ logits_warpers: List[Callable] = [
25
+ lambda logits_token, logits: logits,
26
+ ],
27
+ min_new_token: int = 0,
28
+ now_length: int = 0,
29
+ eos_token: int = 0,
30
+ start_idx: int = 0,
31
+ ):
32
+ # print(inputs_ids.shape)
33
+ B = hidden_states.shape[0]
34
+
35
+ end_idx = torch.zeros(
36
+ inputs_ids.shape[0], device=inputs_ids.device, dtype=torch.long
37
+ )
38
+ finish = torch.zeros(inputs_ids.shape[0], device=inputs_ids.device).bool()
39
+ if not infer_text:
40
+ temperature = (
41
+ temperature.unsqueeze(0)
42
+ .expand(inputs_ids.shape[0], -1)
43
+ .contiguous()
44
+ .view(-1, 1)
45
+ )
46
+
47
+ if infer_text:
48
+ logits: torch.Tensor = self.post_model.head_text(hidden_states)
49
+ else:
50
+ # logits = torch.stack([self.head_code[i](hidden_states) for i in range(self.num_vq)], 3)
51
+ logits = torch.empty(
52
+ hidden_states.size(0),
53
+ hidden_states.size(1),
54
+ self.num_audio_tokens,
55
+ self.num_vq,
56
+ dtype=torch.float,
57
+ device=self.device,
58
+ )
59
+ for num_vq_iter in range(self.num_vq):
60
+ x: torch.Tensor = self.post_model.head_code[num_vq_iter](hidden_states)
61
+ logits[..., num_vq_iter] = x
62
+ del x
63
+
64
+ del hidden_states
65
+
66
+ # logits = logits[:, -1].float()
67
+ logits = logits.narrow(1, -1, 1).squeeze_(1).float()
68
+
69
+ if not infer_text:
70
+ # logits = rearrange(logits, "b c n -> (b n) c")
71
+ logits = logits.permute(0, 2, 1)
72
+ logits = logits.reshape(-1, logits.size(2))
73
+ # logits_token = rearrange(inputs_ids[:, start_idx:], "b c n -> (b n) c")
74
+ inputs_ids_sliced = inputs_ids[:, start_idx:].permute(0, 2, 1)
75
+ logits_token = inputs_ids_sliced.reshape(
76
+ inputs_ids_sliced.size(0) * inputs_ids_sliced.size(1),
77
+ -1,
78
+ ).to(self.device)
79
+ else:
80
+ logits_token = inputs_ids[:, start_idx:, 0].to(self.device)
81
+
82
+ logits /= temperature
83
+
84
+ for logitsProcessors in logits_processors:
85
+ logits = logitsProcessors(logits_token, logits)
86
+
87
+ for logitsWarpers in logits_warpers:
88
+ logits = logitsWarpers(logits_token, logits)
89
+
90
+ del logits_token
91
+
92
+ if now_length < min_new_token:
93
+ logits[:, eos_token] = -torch.inf
94
+
95
+ scores = F.softmax(logits, dim=-1)
96
+ idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
97
+ if not infer_text:
98
+ scores = scores.reshape(B, -1, scores.shape[-1])
99
+ if not infer_text:
100
+ # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
101
+ idx_next = idx_next.view(-1, self.num_vq)
102
+ finish_or = idx_next.eq(eos_token).any(1)
103
+ finish.logical_or_(finish_or)
104
+ del finish_or
105
+ else:
106
+ finish_or = idx_next.eq(eos_token).any(1)
107
+ finish.logical_or_(finish_or)
108
+ del finish_or
109
+
110
+ del inputs_ids
111
+
112
+ not_finished = finish.logical_not().to(end_idx.device)
113
+
114
+ end_idx.add_(not_finished.int())
115
+ idx_next = idx_next[:, None, :]
116
+ return (
117
+ idx_next,
118
+ torch.log(scores),
119
+ finish,
120
+ )
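Stripped of the multi-codebook reshaping, the core of `Sampler.sample` is temperature scaling, the logits processors/warpers, a softmax, and a multinomial draw. A minimal single-codebook sketch of that core:

import torch
import torch.nn.functional as F

def sample_next(logits: torch.Tensor, temperature: float = 1.0,
                eos_token: int = 0, ban_eos: bool = False) -> torch.Tensor:
    # logits: (B, vocab) for the last position only.
    logits = logits / temperature
    if ban_eos:                              # mirrors the min_new_token guard above
        logits[:, eos_token] = -torch.inf
    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1)   # (B, 1) sampled token ids

print(sample_next(torch.randn(2, 10), temperature=0.7).shape)   # torch.Size([2, 1])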
ChatTTS/model/velocity/sampling_params.py ADDED
@@ -0,0 +1,296 @@
1
+ """Sampling parameters for text generation."""
2
+
3
+ from enum import IntEnum
4
+ from functools import cached_property
5
+ from typing import Callable, List, Optional, Union
6
+
7
+ import torch
8
+
9
+ _SAMPLING_EPS = 1e-5
10
+
11
+
12
+ class SamplingType(IntEnum):
13
+ GREEDY = 0
14
+ RANDOM = 1
15
+ BEAM = 2
16
+
17
+
18
+ LogitsProcessor = Callable[[List[int], torch.Tensor], torch.Tensor]
19
+ """LogitsProcessor is a function that takes a list of previously generated
20
+ tokens and a tensor of the logits for the next token, and returns a modified
21
+ tensor of logits to sample from."""
22
+
23
+
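A `LogitsProcessor`, per the alias above, takes the previously generated tokens and the next-token logits and returns adjusted logits. A hedged example that simply bans one token id (note that in this velocity fork the processors stored on `SamplingParams` below are applied to batched tensors inside `Sampler.sample`, so the exact call shapes may differ):

import torch

def ban_token(token_id: int):
    def processor(generated_tokens, logits: torch.Tensor) -> torch.Tensor:
        # Never sample `token_id`, regardless of what has been generated so far.
        logits[..., token_id] = -float("inf")
        return logits
    return processor

no_token_42 = ban_token(42)   # usable wherever a LogitsProcessor is expected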
24
+ class SamplingParams:
25
+ """Sampling parameters for text generation.
26
+
27
+ Overall, we follow the sampling parameters from the OpenAI text completion
28
+ API (https://platform.openai.com/docs/api-reference/completions/create).
29
+ In addition, we support beam search, which is not supported by OpenAI.
30
+
31
+ Args:
32
+ n: Number of output sequences to return for the given prompt.
33
+ best_of: Number of output sequences that are generated from the prompt.
34
+ From these `best_of` sequences, the top `n` sequences are returned.
35
+ `best_of` must be greater than or equal to `n`. This is treated as
36
+ the beam width when `use_beam_search` is True. By default, `best_of`
37
+ is set to `n`.
38
+ presence_penalty: Float that penalizes new tokens based on whether they
39
+ appear in the generated text so far. Values > 0 encourage the model
40
+ to use new tokens, while values < 0 encourage the model to repeat
41
+ tokens.
42
+ frequency_penalty: Float that penalizes new tokens based on their
43
+ frequency in the generated text so far. Values > 0 encourage the
44
+ model to use new tokens, while values < 0 encourage the model to
45
+ repeat tokens.
46
+ repetition_penalty: Float that penalizes new tokens based on whether
47
+ they appear in the prompt and the generated text so far. Values > 1
48
+ encourage the model to use new tokens, while values < 1 encourage
49
+ the model to repeat tokens.
50
+ temperature: Float that controls the randomness of the sampling. Lower
51
+ values make the model more deterministic, while higher values make
52
+ the model more random. Zero means greedy sampling.
53
+ top_p: Float that controls the cumulative probability of the top tokens
54
+ to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
55
+ top_k: Integer that controls the number of top tokens to consider. Set
56
+ to -1 to consider all tokens.
57
+ min_p: Float that represents the minimum probability for a token to be
58
+ considered, relative to the probability of the most likely token.
59
+ Must be in [0, 1]. Set to 0 to disable this.
60
+ use_beam_search: Whether to use beam search instead of sampling.
61
+ length_penalty: Float that penalizes sequences based on their length.
62
+ Used in beam search.
63
+ early_stopping: Controls the stopping condition for beam search. It
64
+ accepts the following values: `True`, where the generation stops as
65
+ soon as there are `best_of` complete candidates; `False`, where an
66
+ heuristic is applied and the generation stops when is it very
67
+ unlikely to find better candidates; `"never"`, where the beam search
68
+ procedure only stops when there cannot be better candidates
69
+ (canonical beam search algorithm).
70
+ stop: List of strings that stop the generation when they are generated.
71
+ The returned output will not contain the stop strings.
72
+ stop_token_ids: List of tokens that stop the generation when they are
73
+ generated. The returned output will contain the stop tokens unless
74
+ the stop tokens are special tokens.
75
+ include_stop_str_in_output: Whether to include the stop strings in output
76
+ text. Defaults to False.
77
+ ignore_eos: Whether to ignore the EOS token and continue generating
78
+ tokens after the EOS token is generated.
79
+ max_tokens: Maximum number of tokens to generate per output sequence.
80
+ logprobs: Number of log probabilities to return per output token.
81
+ Note that the implementation follows the OpenAI API: The return
82
+ result includes the log probabilities on the `logprobs` most likely
83
+ tokens, as well the chosen tokens. The API will always return the
84
+ log probability of the sampled token, so there may be up to
85
+ `logprobs+1` elements in the response.
86
+ prompt_logprobs: Number of log probabilities to return per prompt token.
87
+ skip_special_tokens: Whether to skip special tokens in the output.
88
+ spaces_between_special_tokens: Whether to add spaces between special
89
+ tokens in the output. Defaults to True.
90
+ logits_processors: List of functions that modify logits based on
91
+ previously generated tokens.
92
+ """
93
+
94
+ def __init__(
95
+ self,
96
+ n: int = 1,
97
+ best_of: Optional[int] = None,
98
+ presence_penalty: float = 0.0,
99
+ frequency_penalty: float = 0.0,
100
+ repetition_penalty: float = 1.0,
101
+ temperature: float = 1.0,
102
+ top_p: float = 1.0,
103
+ top_k: int = -1,
104
+ min_p: float = 0.0,
105
+ use_beam_search: bool = False,
106
+ length_penalty: float = 1.0,
107
+ early_stopping: Union[bool, str] = False,
108
+ stop: Optional[Union[str, List[str]]] = None,
109
+ stop_token_ids: Optional[List[int]] = None,
110
+ include_stop_str_in_output: bool = False,
111
+ ignore_eos: bool = False,
112
+ max_tokens: int = 16,
113
+ logprobs: Optional[int] = None,
114
+ prompt_logprobs: Optional[int] = None,
115
+ skip_special_tokens: bool = True,
116
+ spaces_between_special_tokens: bool = True,
117
+ logits_processors: Optional[List[LogitsProcessor]] = (
118
+ [
119
+ lambda logits_token, logits: logits,
120
+ ],
121
+ [
122
+ lambda logits_token, logits: logits,
123
+ ],
124
+ ),
125
+ min_new_token: int = 0,
126
+ max_new_token: int = 8192,
127
+ infer_text: bool = False,
128
+ eos_token: int = 0,
129
+ spk_emb: str = None,
130
+ start_idx: int = 0,
131
+ ) -> None:
132
+ self.n = n
133
+ self.best_of = best_of if best_of is not None else n
134
+ self.presence_penalty = presence_penalty
135
+ self.frequency_penalty = frequency_penalty
136
+ self.repetition_penalty = repetition_penalty
137
+ self.temperature = temperature
138
+ self.top_p = top_p
139
+ self.top_k = top_k
140
+ self.min_p = min_p
141
+ self.use_beam_search = use_beam_search
142
+ self.length_penalty = length_penalty
143
+ self.early_stopping = early_stopping
144
+ self.min_new_token = min_new_token
145
+ self.max_new_token = max_new_token
146
+ self.infer_text = infer_text
147
+ self.eos_token = eos_token
148
+ self.spk_emb = spk_emb
149
+ self.start_idx = start_idx
150
+ if stop is None:
151
+ self.stop = []
152
+ elif isinstance(stop, str):
153
+ self.stop = [stop]
154
+ else:
155
+ self.stop = list(stop)
156
+ if stop_token_ids is None:
157
+ self.stop_token_ids = []
158
+ else:
159
+ self.stop_token_ids = list(stop_token_ids)
160
+ self.ignore_eos = ignore_eos
161
+ self.max_tokens = max_tokens
162
+ self.logprobs = logprobs
163
+ self.prompt_logprobs = prompt_logprobs
164
+ self.skip_special_tokens = skip_special_tokens
165
+ self.spaces_between_special_tokens = spaces_between_special_tokens
166
+ self.logits_processors = logits_processors
167
+ self.include_stop_str_in_output = include_stop_str_in_output
168
+ self._verify_args()
169
+ if self.use_beam_search:
170
+ self._verify_beam_search()
171
+ else:
172
+ self._verify_non_beam_search()
173
+ # if self.temperature < _SAMPLING_EPS:
174
+ # # Zero temperature means greedy sampling.
175
+ # self.top_p = 1.0
176
+ # self.top_k = -1
177
+ # self.min_p = 0.0
178
+ # self._verify_greedy_sampling()
179
+
180
+ def _verify_args(self) -> None:
181
+ if self.n < 1:
182
+ raise ValueError(f"n must be at least 1, got {self.n}.")
183
+ if self.best_of < self.n:
184
+ raise ValueError(
185
+ f"best_of must be greater than or equal to n, "
186
+ f"got n={self.n} and best_of={self.best_of}."
187
+ )
188
+ if not -2.0 <= self.presence_penalty <= 2.0:
189
+ raise ValueError(
190
+ "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
191
+ )
192
+ if not -2.0 <= self.frequency_penalty <= 2.0:
193
+ raise ValueError(
194
+ "frequency_penalty must be in [-2, 2], got "
195
+ f"{self.frequency_penalty}."
196
+ )
197
+ if not 0.0 < self.repetition_penalty <= 2.0:
198
+ raise ValueError(
199
+ "repetition_penalty must be in (0, 2], got "
200
+ f"{self.repetition_penalty}."
201
+ )
202
+ # if self.temperature < 0.0:
203
+ # raise ValueError(
204
+ # f"temperature must be non-negative, got {self.temperature}.")
205
+ if not 0.0 < self.top_p <= 1.0:
206
+ raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
207
+ if self.top_k < -1 or self.top_k == 0:
208
+ raise ValueError(
209
+ f"top_k must be -1 (disable), or at least 1, " f"got {self.top_k}."
210
+ )
211
+ if not 0.0 <= self.min_p <= 1.0:
212
+ raise ValueError("min_p must be in [0, 1], got " f"{self.min_p}.")
213
+ if self.max_tokens < 1:
214
+ raise ValueError(f"max_tokens must be at least 1, got {self.max_tokens}.")
215
+ if self.logprobs is not None and self.logprobs < 0:
216
+ raise ValueError(f"logprobs must be non-negative, got {self.logprobs}.")
217
+ if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
218
+ raise ValueError(
219
+ f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}."
220
+ )
221
+
222
+ def _verify_beam_search(self) -> None:
223
+ if self.best_of == 1:
224
+ raise ValueError(
225
+ "best_of must be greater than 1 when using beam "
226
+ f"search. Got {self.best_of}."
227
+ )
228
+ if self.temperature > _SAMPLING_EPS:
229
+ raise ValueError("temperature must be 0 when using beam search.")
230
+ if self.top_p < 1.0 - _SAMPLING_EPS:
231
+ raise ValueError("top_p must be 1 when using beam search.")
232
+ if self.top_k != -1:
233
+ raise ValueError("top_k must be -1 when using beam search.")
234
+ if self.early_stopping not in [True, False, "never"]:
235
+ raise ValueError(
236
+ f"early_stopping must be True, False, or 'never', "
237
+ f"got {self.early_stopping}."
238
+ )
239
+
240
+ def _verify_non_beam_search(self) -> None:
241
+ if self.early_stopping is not False:
242
+ raise ValueError(
243
+ "early_stopping is not effective and must be "
244
+ "False when not using beam search."
245
+ )
246
+ if (
247
+ self.length_penalty < 1.0 - _SAMPLING_EPS
248
+ or self.length_penalty > 1.0 + _SAMPLING_EPS
249
+ ):
250
+ raise ValueError(
251
+ "length_penalty is not effective and must be the "
252
+ "default value of 1.0 when not using beam search."
253
+ )
254
+
255
+ def _verify_greedy_sampling(self) -> None:
256
+ if self.best_of > 1:
257
+ raise ValueError(
258
+ "best_of must be 1 when using greedy sampling." f"Got {self.best_of}."
259
+ )
260
+
261
+ @cached_property
262
+ def sampling_type(self) -> SamplingType:
263
+ if self.use_beam_search:
264
+ return SamplingType.BEAM
265
+ # if self.temperature < _SAMPLING_EPS:
266
+ # return SamplingType.GREEDY
267
+ return SamplingType.RANDOM
268
+
269
+ def __repr__(self) -> str:
270
+ return (
271
+ f"SamplingParams(n={self.n}, "
272
+ f"best_of={self.best_of}, "
273
+ f"presence_penalty={self.presence_penalty}, "
274
+ f"frequency_penalty={self.frequency_penalty}, "
275
+ f"repetition_penalty={self.repetition_penalty}, "
276
+ f"temperature={self.temperature}, "
277
+ f"top_p={self.top_p}, "
278
+ f"top_k={self.top_k}, "
279
+ f"min_p={self.min_p}, "
280
+ f"use_beam_search={self.use_beam_search}, "
281
+ f"length_penalty={self.length_penalty}, "
282
+ f"early_stopping={self.early_stopping}, "
283
+ f"stop={self.stop}, "
284
+ f"stop_token_ids={self.stop_token_ids}, "
285
+ f"include_stop_str_in_output={self.include_stop_str_in_output}, "
286
+ f"ignore_eos={self.ignore_eos}, "
287
+ f"max_tokens={self.max_tokens}, "
288
+ f"logprobs={self.logprobs}, "
289
+ f"prompt_logprobs={self.prompt_logprobs}, "
290
+ f"skip_special_tokens={self.skip_special_tokens}, "
291
+ "spaces_between_special_tokens="
292
+ f"{self.spaces_between_special_tokens}), "
293
+ f"max_new_token={self.max_new_token}), "
294
+ f"min_new_token={self.min_new_token}), "
295
+ f"infer_text={self.infer_text})"
296
+ )
ChatTTS/model/velocity/scheduler.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import enum
2
+ import time
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+
5
+ from vllm.config import CacheConfig, SchedulerConfig
6
+ from .block_manager import AllocStatus, BlockSpaceManager
7
+ from vllm.core.policy import PolicyFactory
8
+ from vllm.logger import init_logger
9
+ from .sequence import (
10
+ Sequence,
11
+ SequenceData,
12
+ SequenceGroup,
13
+ SequenceGroupMetadata,
14
+ SequenceStatus,
15
+ )
16
+
17
+ logger = init_logger(__name__)
18
+
19
+
20
+ class PreemptionMode(enum.Enum):
21
+ """Preemption modes.
22
+
23
+ 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
24
+ and swap them back in when the sequences are resumed.
25
+ 2. Recomputation: Discard the blocks of the preempted sequences and
26
+ recompute them when the sequences are resumed, treating the sequences as
27
+ new prompts.
28
+ """
29
+
30
+ SWAP = enum.auto()
31
+ RECOMPUTE = enum.auto()
32
+
33
+
34
+ class SchedulerOutputs:
35
+
36
+ def __init__(
37
+ self,
38
+ scheduled_seq_groups: List[SequenceGroup],
39
+ prompt_run: bool,
40
+ num_batched_tokens: int,
41
+ blocks_to_swap_in: Dict[int, int],
42
+ blocks_to_swap_out: Dict[int, int],
43
+ blocks_to_copy: Dict[int, List[int]],
44
+ ignored_seq_groups: List[SequenceGroup],
45
+ ) -> None:
46
+ self.scheduled_seq_groups = scheduled_seq_groups
47
+ self.prompt_run = prompt_run
48
+ self.num_batched_tokens = num_batched_tokens
49
+ self.blocks_to_swap_in = blocks_to_swap_in
50
+ self.blocks_to_swap_out = blocks_to_swap_out
51
+ self.blocks_to_copy = blocks_to_copy
52
+ # Swap in and swap out should never happen at the same time.
53
+ assert not (blocks_to_swap_in and blocks_to_swap_out)
54
+ self.ignored_seq_groups = ignored_seq_groups
55
+
56
+ def is_empty(self) -> bool:
57
+ # NOTE: We do not consider the ignored sequence groups.
58
+ return (
59
+ not self.scheduled_seq_groups
60
+ and not self.blocks_to_swap_in
61
+ and not self.blocks_to_swap_out
62
+ and not self.blocks_to_copy
63
+ )
64
+
65
+
66
+ class Scheduler:
67
+
68
+ def __init__(
69
+ self,
70
+ scheduler_config: SchedulerConfig,
71
+ cache_config: CacheConfig,
72
+ ) -> None:
73
+ self.scheduler_config = scheduler_config
74
+ self.cache_config = cache_config
75
+
76
+ self.prompt_limit = min(
77
+ self.scheduler_config.max_model_len,
78
+ self.scheduler_config.max_num_batched_tokens,
79
+ )
80
+
81
+ # Instantiate the scheduling policy.
82
+ self.policy = PolicyFactory.get_policy(policy_name="fcfs")
83
+ # Create the block space manager.
84
+ self.block_manager = BlockSpaceManager(
85
+ block_size=self.cache_config.block_size,
86
+ num_gpu_blocks=self.cache_config.num_gpu_blocks,
87
+ num_cpu_blocks=self.cache_config.num_cpu_blocks,
88
+ sliding_window=self.cache_config.sliding_window,
89
+ )
90
+
91
+ # TODO(zhuohan): Use deque instead of list for better performance.
92
+ # Sequence groups in the WAITING state.
93
+ self.waiting: List[SequenceGroup] = []
94
+ # Sequence groups in the RUNNING state.
95
+ self.running: List[SequenceGroup] = []
96
+ # Sequence groups in the SWAPPED state.
97
+ self.swapped: List[SequenceGroup] = []
98
+
99
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
100
+ # Add sequence groups to the waiting queue.
101
+ self.waiting.append(seq_group)
102
+
103
+ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
104
+ if isinstance(request_id, str):
105
+ request_id = (request_id,)
106
+ request_ids = set(request_id)
107
+ for state_queue in [self.waiting, self.running, self.swapped]:
108
+ # We need to reverse the list as we are removing elements
109
+ # from it as we iterate over it. If we don't do it,
110
+ # indices will get messed up and we will skip over elements.
111
+ for seq_group in reversed(state_queue):
112
+ if seq_group.request_id in request_ids:
113
+ # Remove the sequence group from the state queue.
114
+ state_queue.remove(seq_group)
115
+ for seq in seq_group.get_seqs():
116
+ if seq.is_finished():
117
+ continue
118
+ seq.status = SequenceStatus.FINISHED_ABORTED
119
+ self.free_seq(seq)
120
+ request_ids.remove(seq_group.request_id)
121
+ if not request_ids:
122
+ return
123
+
124
+ def has_unfinished_seqs(self) -> bool:
125
+ return self.waiting or self.running or self.swapped
126
+
127
+ def get_num_unfinished_seq_groups(self) -> int:
128
+ return len(self.waiting) + len(self.running) + len(self.swapped)
129
+
130
+ def _schedule(self) -> SchedulerOutputs:
131
+ # Blocks that need to be swaped or copied before model execution.
132
+ blocks_to_swap_in: Dict[int, int] = {}
133
+ blocks_to_swap_out: Dict[int, int] = {}
134
+ blocks_to_copy: Dict[int, List[int]] = {}
135
+
136
+ # Fix the current time.
137
+ now = time.monotonic()
138
+
139
+ # Join waiting sequences if possible.
140
+ if not self.swapped:
141
+ ignored_seq_groups: List[SequenceGroup] = []
142
+ scheduled: List[SequenceGroup] = []
143
+ # The total number of sequences on the fly, including the
144
+ # requests in the generation phase.
145
+ num_curr_seqs = sum(
146
+ seq_group.get_max_num_running_seqs() for seq_group in self.running
147
+ )
148
+ seq_lens: List[int] = []
149
+
150
+ # Optimization: We do not sort the waiting queue since the preempted
151
+ # sequence groups are added to the front and the new sequence groups
152
+ # are added to the back.
153
+ while self.waiting:
154
+ seq_group = self.waiting[0]
155
+
156
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
157
+ assert len(waiting_seqs) == 1, (
158
+ "Waiting sequence group should have only one prompt " "sequence."
159
+ )
160
+ num_prompt_tokens = waiting_seqs[0].get_len()
161
+ if num_prompt_tokens > self.prompt_limit:
162
+ logger.warning(
163
+ f"Input prompt ({num_prompt_tokens} tokens) is too long"
164
+ f" and exceeds limit of {self.prompt_limit}"
165
+ )
166
+ for seq in waiting_seqs:
167
+ seq.status = SequenceStatus.FINISHED_IGNORED
168
+ ignored_seq_groups.append(seq_group)
169
+ self.waiting.pop(0)
170
+ continue
171
+
172
+ # If the sequence group cannot be allocated, stop.
173
+ can_allocate = self.block_manager.can_allocate(seq_group)
174
+ if can_allocate == AllocStatus.LATER:
175
+ break
176
+ elif can_allocate == AllocStatus.NEVER:
177
+ logger.warning(
178
+ f"Input prompt ({num_prompt_tokens} tokens) is too long"
179
+ f" and exceeds the capacity of block_manager"
180
+ )
181
+ for seq in waiting_seqs:
182
+ seq.status = SequenceStatus.FINISHED_IGNORED
183
+ ignored_seq_groups.append(seq_group)
184
+ self.waiting.pop(0)
185
+ continue
186
+
187
+ # If the number of batched tokens exceeds the limit, stop.
188
+ new_seq_lens = seq_lens + [num_prompt_tokens]
189
+ num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
190
+ if num_batched_tokens > self.scheduler_config.max_num_batched_tokens:
191
+ break
192
+
193
+ # The total number of sequences in the RUNNING state should not
194
+ # exceed the maximum number of sequences.
195
+ num_new_seqs = seq_group.get_max_num_running_seqs()
196
+ if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
197
+ break
198
+
199
+ num_paddings = num_batched_tokens - sum(new_seq_lens)
200
+ if num_paddings > self.scheduler_config.max_paddings:
201
+ break
202
+ seq_lens = new_seq_lens
203
+
204
+ seq_group = self.waiting.pop(0)
205
+ self._allocate(seq_group)
206
+ self.running.append(seq_group)
207
+ num_curr_seqs += num_new_seqs
208
+ scheduled.append(seq_group)
209
+
210
+ if scheduled or ignored_seq_groups:
211
+ scheduler_outputs = SchedulerOutputs(
212
+ scheduled_seq_groups=scheduled,
213
+ prompt_run=True,
214
+ num_batched_tokens=len(seq_lens) * max(seq_lens) if seq_lens else 0,
215
+ blocks_to_swap_in=blocks_to_swap_in,
216
+ blocks_to_swap_out=blocks_to_swap_out,
217
+ blocks_to_copy=blocks_to_copy,
218
+ ignored_seq_groups=ignored_seq_groups,
219
+ )
220
+ return scheduler_outputs
221
+
222
+ # NOTE(woosuk): Preemption happens only when there is no available slot
223
+ # to keep all the sequence groups in the RUNNING state.
224
+ # In this case, the policy is responsible for deciding which sequence
225
+ # groups to preempt.
226
+ self.running = self.policy.sort_by_priority(now, self.running)
227
+
228
+ # Reserve new token slots for the running sequence groups.
229
+ running: List[SequenceGroup] = []
230
+ preempted: List[SequenceGroup] = []
231
+ while self.running:
232
+ seq_group = self.running.pop(0)
233
+ while not self.block_manager.can_append_slot(seq_group):
234
+ if self.running:
235
+ # Preempt the lowest-priority sequence groups.
236
+ victim_seq_group = self.running.pop(-1)
237
+ self._preempt(victim_seq_group, blocks_to_swap_out)
238
+ preempted.append(victim_seq_group)
239
+ else:
240
+ # No other sequence groups can be preempted.
241
+ # Preempt the current sequence group.
242
+ self._preempt(seq_group, blocks_to_swap_out)
243
+ preempted.append(seq_group)
244
+ break
245
+ else:
246
+ # Append new slots to the sequence group.
247
+ self._append_slot(seq_group, blocks_to_copy)
248
+ running.append(seq_group)
249
+ self.running = running
250
+
251
+ # Swap in the sequence groups in the SWAPPED state if possible.
252
+ self.swapped = self.policy.sort_by_priority(now, self.swapped)
253
+ if not preempted:
254
+ num_curr_seqs = sum(
255
+ seq_group.get_max_num_running_seqs() for seq_group in self.running
256
+ )
257
+
258
+ while self.swapped:
259
+ seq_group = self.swapped[0]
260
+ # If the sequence group cannot be swapped in, stop.
261
+ if not self.block_manager.can_swap_in(seq_group):
262
+ break
263
+
264
+ # The total number of sequences in the RUNNING state should not
265
+ # exceed the maximum number of sequences.
266
+ num_new_seqs = seq_group.get_max_num_running_seqs()
267
+ if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
268
+ break
269
+
270
+ seq_group = self.swapped.pop(0)
271
+ self._swap_in(seq_group, blocks_to_swap_in)
272
+ self._append_slot(seq_group, blocks_to_copy)
273
+ num_curr_seqs += num_new_seqs
274
+ self.running.append(seq_group)
275
+
276
+ # Each sequence in the generation phase only takes one token slot.
277
+ # Therefore, the number of batched tokens is equal to the number of
278
+ # sequences in the RUNNING state.
279
+ num_batched_tokens = sum(
280
+ seq_group.num_seqs(status=SequenceStatus.RUNNING)
281
+ for seq_group in self.running
282
+ )
283
+
284
+ scheduler_outputs = SchedulerOutputs(
285
+ scheduled_seq_groups=self.running,
286
+ prompt_run=False,
287
+ num_batched_tokens=num_batched_tokens,
288
+ blocks_to_swap_in=blocks_to_swap_in,
289
+ blocks_to_swap_out=blocks_to_swap_out,
290
+ blocks_to_copy=blocks_to_copy,
291
+ ignored_seq_groups=[],
292
+ )
293
+ return scheduler_outputs
294
+
295
+ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
296
+ # Schedule sequence groups.
297
+ # This function call changes the internal states of the scheduler
298
+ # such as self.running, self.swapped, and self.waiting.
299
+ scheduler_outputs = self._schedule()
300
+
301
+ # Create input data structures.
302
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
303
+ for seq_group in scheduler_outputs.scheduled_seq_groups:
304
+ seq_data: Dict[int, SequenceData] = {}
305
+ block_tables: Dict[int, List[int]] = {}
306
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
307
+ seq_id = seq.seq_id
308
+ seq_data[seq_id] = seq.data
309
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
310
+
311
+ seq_group_metadata = SequenceGroupMetadata(
312
+ request_id=seq_group.request_id,
313
+ is_prompt=scheduler_outputs.prompt_run,
314
+ seq_data=seq_data,
315
+ sampling_params=seq_group.sampling_params,
316
+ block_tables=block_tables,
317
+ )
318
+ seq_group_metadata_list.append(seq_group_metadata)
319
+ return seq_group_metadata_list, scheduler_outputs
320
+
321
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
322
+ self.block_manager.fork(parent_seq, child_seq)
323
+
324
+ def free_seq(self, seq: Sequence) -> None:
325
+ self.block_manager.free(seq)
326
+
327
+ def free_finished_seq_groups(self) -> None:
328
+ self.running = [
329
+ seq_group for seq_group in self.running if not seq_group.is_finished()
330
+ ]
331
+
332
+ def _allocate(self, seq_group: SequenceGroup) -> None:
333
+ self.block_manager.allocate(seq_group)
334
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
335
+ seq.status = SequenceStatus.RUNNING
336
+
337
+ def _append_slot(
338
+ self,
339
+ seq_group: SequenceGroup,
340
+ blocks_to_copy: Dict[int, List[int]],
341
+ ) -> None:
342
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
343
+ ret = self.block_manager.append_slot(seq)
344
+ if ret is not None:
345
+ src_block, dst_block = ret
346
+ if src_block in blocks_to_copy:
347
+ blocks_to_copy[src_block].append(dst_block)
348
+ else:
349
+ blocks_to_copy[src_block] = [dst_block]
350
+
351
+ def _preempt(
352
+ self,
353
+ seq_group: SequenceGroup,
354
+ blocks_to_swap_out: Dict[int, int],
355
+ preemption_mode: Optional[PreemptionMode] = None,
356
+ ) -> None:
357
+ # If preemption mode is not specified, we determine the mode as follows:
358
+ # We use recomputation by default since it incurs lower overhead than
359
+ # swapping. However, when the sequence group has multiple sequences
360
+ # (e.g., beam search), recomputation is not currently supported. In
361
+ # such a case, we use swapping instead.
362
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
363
+ # As swapped sequences are prioritized over waiting sequences,
364
+ # sequence groups with multiple sequences are implicitly prioritized
365
+ # over sequence groups with a single sequence.
366
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
367
+ # sequences. This may require a more sophisticated CUDA kernel.
368
+ if preemption_mode is None:
369
+ if seq_group.get_max_num_running_seqs() == 1:
370
+ preemption_mode = PreemptionMode.RECOMPUTE
371
+ else:
372
+ preemption_mode = PreemptionMode.SWAP
373
+ if preemption_mode == PreemptionMode.RECOMPUTE:
374
+ self._preempt_by_recompute(seq_group)
375
+ elif preemption_mode == PreemptionMode.SWAP:
376
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
377
+ else:
378
+ raise AssertionError("Invalid preemption mode.")
379
+
380
+ def _preempt_by_recompute(
381
+ self,
382
+ seq_group: SequenceGroup,
383
+ ) -> None:
384
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
385
+ assert len(seqs) == 1
386
+ for seq in seqs:
387
+ seq.status = SequenceStatus.WAITING
388
+ self.block_manager.free(seq)
389
+ # NOTE: For FCFS, we insert the preempted sequence group to the front
390
+ # of the waiting queue.
391
+ self.waiting.insert(0, seq_group)
392
+
393
+ def _preempt_by_swap(
394
+ self,
395
+ seq_group: SequenceGroup,
396
+ blocks_to_swap_out: Dict[int, int],
397
+ ) -> None:
398
+ self._swap_out(seq_group, blocks_to_swap_out)
399
+ self.swapped.append(seq_group)
400
+
401
+ def _swap_in(
402
+ self,
403
+ seq_group: SequenceGroup,
404
+ blocks_to_swap_in: Dict[int, int],
405
+ ) -> None:
406
+ mapping = self.block_manager.swap_in(seq_group)
407
+ blocks_to_swap_in.update(mapping)
408
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
409
+ seq.status = SequenceStatus.RUNNING
410
+
411
+ def _swap_out(
412
+ self,
413
+ seq_group: SequenceGroup,
414
+ blocks_to_swap_out: Dict[int, int],
415
+ ) -> None:
416
+ if not self.block_manager.can_swap_out(seq_group):
417
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
418
+ # entire engine.
419
+ raise RuntimeError(
420
+ "Aborted due to the lack of CPU swap space. Please increase "
421
+ "the swap space to avoid this error."
422
+ )
423
+ mapping = self.block_manager.swap_out(seq_group)
424
+ blocks_to_swap_out.update(mapping)
425
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
426
+ seq.status = SequenceStatus.SWAPPED
ChatTTS/model/velocity/sequence.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Sequence and its related classes."""
2
+
3
+ import copy
4
+ import enum
5
+ from typing import Dict, List, Optional, Union
6
+ import torch
7
+ from vllm.block import LogicalTokenBlock
8
+ from .sampling_params import SamplingParams
9
+
10
+ PromptLogprobs = List[Optional[Dict[int, float]]]
11
+ SampleLogprobs = List[Dict[int, float]]
12
+
13
+
14
+ class SequenceStatus(enum.Enum):
15
+ """Status of a sequence."""
16
+
17
+ WAITING = enum.auto()
18
+ RUNNING = enum.auto()
19
+ SWAPPED = enum.auto()
20
+ FINISHED_STOPPED = enum.auto()
21
+ FINISHED_LENGTH_CAPPED = enum.auto()
22
+ FINISHED_ABORTED = enum.auto()
23
+ FINISHED_IGNORED = enum.auto()
24
+
25
+ @staticmethod
26
+ def is_finished(status: "SequenceStatus") -> bool:
27
+ return status in [
28
+ SequenceStatus.FINISHED_STOPPED,
29
+ SequenceStatus.FINISHED_LENGTH_CAPPED,
30
+ SequenceStatus.FINISHED_ABORTED,
31
+ SequenceStatus.FINISHED_IGNORED,
32
+ ]
33
+
34
+ @staticmethod
35
+ def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
36
+ if status == SequenceStatus.FINISHED_STOPPED:
37
+ finish_reason = "stop"
38
+ elif status == SequenceStatus.FINISHED_LENGTH_CAPPED:
39
+ finish_reason = "length"
40
+ elif status == SequenceStatus.FINISHED_ABORTED:
41
+ finish_reason = "abort"
42
+ elif status == SequenceStatus.FINISHED_IGNORED:
43
+ # The ignored sequences are the sequences whose prompt lengths
44
+ # are longer than the model's length cap. Therefore, the stop
45
+ # reason should also be "length" as in OpenAI API.
46
+ finish_reason = "length"
47
+ else:
48
+ finish_reason = None
49
+ return finish_reason
50
+
51
+
52
+ class SequenceData:
53
+ """Data associated with a sequence.
54
+
55
+
56
+ Args:
57
+ prompt_token_ids: The token IDs of the prompt.
58
+
59
+ Attributes:
60
+ prompt_token_ids: The token IDs of the prompt.
61
+ output_token_ids: The token IDs of the output.
62
+ cumulative_logprob: The cumulative log probability of the output.
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ prompt_token_ids: List[int],
68
+ ) -> None:
69
+ self.prompt_token_ids = prompt_token_ids
70
+ self.output_token_ids: List[int] = []
71
+ self.cumulative_logprob = 0.0
72
+ self.hidden_states: Optional[torch.Tensor] = None
73
+ self.finished = False
74
+
75
+ def append_token_id(self, token_id: int, logprob: float) -> None:
76
+ if isinstance(self.cumulative_logprob, float):
77
+ self.cumulative_logprob = [
78
+ 0.0,
79
+ ] * len(logprob)
80
+ self.output_token_ids.append(token_id)
81
+ for i in range(len(self.cumulative_logprob)):
82
+ self.cumulative_logprob[i] += logprob[i]
83
+
84
+ def append_hidden_states(self, hidden_states: torch.Tensor) -> None:
85
+ if self.hidden_states is None:
86
+ self.hidden_states = hidden_states
87
+ else:
88
+ self.hidden_states = torch.cat([self.hidden_states, hidden_states], dim=0)
89
+
90
+ def get_len(self) -> int:
91
+ return len(self.output_token_ids) + len(self.prompt_token_ids)
92
+
93
+ def get_prompt_len(self) -> int:
94
+ return len(self.prompt_token_ids)
95
+
96
+ def get_output_len(self) -> int:
97
+ return len(self.output_token_ids)
98
+
99
+ def get_token_ids(self) -> List[int]:
100
+ return self.prompt_token_ids + self.output_token_ids
101
+
102
+ def get_last_token_id(self) -> int:
103
+ if not self.output_token_ids:
104
+ return self.prompt_token_ids[-1]
105
+ return self.output_token_ids[-1]
106
+
107
+ def __repr__(self) -> str:
108
+ return (
109
+ f"SequenceData("
110
+ f"prompt_token_ids={self.prompt_token_ids}, "
111
+ f"output_token_ids={self.output_token_ids}, "
112
+ f"cumulative_logprob={self.cumulative_logprob}), "
113
+ f"hidden_states={self.hidden_states.shape if self.hidden_states is not None else None}, "
114
+ f"finished={self.finished})"
115
+ )
116
+
117
+
118
+ class Sequence:
119
+ """Stores the data, status, and block information of a sequence.
120
+
121
+ Args:
122
+ seq_id: The ID of the sequence.
123
+ prompt: The prompt of the sequence.
124
+ prompt_token_ids: The token IDs of the prompt.
125
+ block_size: The block size of the sequence. Should be the same as the
126
+ block size used by the block manager and cache engine.
127
+ """
128
+
129
+ def __init__(
130
+ self,
131
+ seq_id: int,
132
+ prompt: str,
133
+ prompt_token_ids: List[int],
134
+ block_size: int,
135
+ ) -> None:
136
+ self.seq_id = seq_id
137
+ self.prompt = prompt
138
+ self.block_size = block_size
139
+
140
+ self.data = SequenceData(prompt_token_ids)
141
+ self.output_logprobs: SampleLogprobs = []
142
+ self.output_text = ""
143
+
144
+ self.logical_token_blocks: List[LogicalTokenBlock] = []
145
+ # Initialize the logical token blocks with the prompt token ids.
146
+ self._append_tokens_to_blocks(prompt_token_ids)
147
+ self.status = SequenceStatus.WAITING
148
+
149
+ # Used for incremental detokenization
150
+ self.prefix_offset = 0
151
+ self.read_offset = 0
152
+ # Input + output tokens
153
+ self.tokens: Optional[List[str]] = None
154
+
155
+ def _append_logical_block(self) -> None:
156
+ block = LogicalTokenBlock(
157
+ block_number=len(self.logical_token_blocks),
158
+ block_size=self.block_size,
159
+ )
160
+ self.logical_token_blocks.append(block)
161
+
162
+ def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
163
+ cursor = 0
164
+ while cursor < len(token_ids):
165
+ if not self.logical_token_blocks:
166
+ self._append_logical_block()
167
+
168
+ last_block = self.logical_token_blocks[-1]
169
+ if last_block.is_full():
170
+ self._append_logical_block()
171
+ last_block = self.logical_token_blocks[-1]
172
+
173
+ num_empty_slots = last_block.get_num_empty_slots()
174
+ last_block.append_tokens(token_ids[cursor : cursor + num_empty_slots])
175
+ cursor += num_empty_slots
176
+
177
+ def append_token_id(
178
+ self,
179
+ token_id: int,
180
+ logprobs: Dict[int, float],
181
+ hidden_states: Optional[torch.Tensor] = None,
182
+ finished: bool = False,
183
+ ) -> None:
184
+ assert token_id in logprobs
185
+ self._append_tokens_to_blocks([token_id])
186
+ self.output_logprobs.append(logprobs)
187
+ self.data.append_token_id(token_id, logprobs[token_id])
188
+ self.data.append_hidden_states(hidden_states)
189
+ self.data.finished = finished
190
+
191
+ def get_len(self) -> int:
192
+ return self.data.get_len()
193
+
194
+ def get_prompt_len(self) -> int:
195
+ return self.data.get_prompt_len()
196
+
197
+ def get_output_len(self) -> int:
198
+ return self.data.get_output_len()
199
+
200
+ def get_token_ids(self) -> List[int]:
201
+ return self.data.get_token_ids()
202
+
203
+ def get_last_token_id(self) -> int:
204
+ return self.data.get_last_token_id()
205
+
206
+ def get_output_token_ids(self) -> List[int]:
207
+ return self.data.output_token_ids
208
+
209
+ def get_cumulative_logprob(self) -> float:
210
+ return self.data.cumulative_logprob
211
+
212
+ def get_beam_search_score(
213
+ self,
214
+ length_penalty: float = 0.0,
215
+ seq_len: Optional[int] = None,
216
+ eos_token_id: Optional[int] = None,
217
+ ) -> float:
218
+ """Calculate the beam search score with length penalty.
219
+
220
+ Adapted from
221
+
222
+ https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
223
+ """
224
+ if seq_len is None:
225
+ seq_len = self.get_len()
226
+ # NOTE: HF implementation does not count the EOS token
227
+ # towards the length, we align with that here for testing.
228
+ if eos_token_id is not None and self.get_last_token_id() == eos_token_id:
229
+ seq_len -= 1
230
+ return self.get_cumulative_logprob() / (seq_len**length_penalty)
231
+
232
+ def is_finished(self) -> bool:
233
+ return SequenceStatus.is_finished(self.status)
234
+
235
+ def fork(self, new_seq_id: int) -> "Sequence":
236
+ new_seq = copy.deepcopy(self)
237
+ new_seq.seq_id = new_seq_id
238
+ return new_seq
239
+
240
+ def __repr__(self) -> str:
241
+ return (
242
+ f"Sequence(seq_id={self.seq_id}, "
243
+ f"status={self.status.name}, "
244
+ f"num_blocks={len(self.logical_token_blocks)})"
245
+ )
246
+
247
+
248
+ class SequenceGroup:
249
+ """A group of sequences that are generated from the same prompt.
250
+
251
+ Args:
252
+ request_id: The ID of the request.
253
+ seqs: The list of sequences.
254
+ sampling_params: The sampling parameters used to generate the outputs.
255
+ arrival_time: The arrival time of the request.
256
+ """
257
+
258
+ def __init__(
259
+ self,
260
+ request_id: str,
261
+ seqs: List[Sequence],
262
+ sampling_params: SamplingParams,
263
+ arrival_time: float,
264
+ ) -> None:
265
+ self.request_id = request_id
266
+ self.seqs_dict = {seq.seq_id: seq for seq in seqs}
267
+ self.sampling_params = sampling_params
268
+ self.arrival_time = arrival_time
269
+ self.prompt_logprobs: Optional[PromptLogprobs] = None
270
+
271
+ @property
272
+ def prompt(self) -> str:
273
+ # All sequences in the group should have the same prompt.
274
+ # We use the prompt of an arbitrary sequence.
275
+ return next(iter(self.seqs_dict.values())).prompt
276
+
277
+ @property
278
+ def prompt_token_ids(self) -> List[int]:
279
+ # All sequences in the group should have the same prompt.
280
+ # We use the prompt of an arbitrary sequence.
281
+ return next(iter(self.seqs_dict.values())).data.prompt_token_ids
282
+
283
+ def get_max_num_running_seqs(self) -> int:
284
+ """The maximum number of sequences running in parallel in the remaining
285
+ lifetime of the request."""
286
+ if self.sampling_params.use_beam_search:
287
+ # For beam search, maximally there will always be `best_of` beam
288
+ # candidates running in the future.
289
+ return self.sampling_params.best_of
290
+ else:
291
+ if self.sampling_params.best_of > self.num_seqs():
292
+ # At prompt stage, the sequence group is not yet filled up
293
+ # and only have one sequence running. However, in the
294
+ # generation stage, we will have `best_of` sequences running.
295
+ return self.sampling_params.best_of
296
+ # At sampling stages, return the number of actual sequences
297
+ # that are not finished yet.
298
+ return self.num_unfinished_seqs()
299
+
300
+ def get_seqs(
301
+ self,
302
+ status: Optional[SequenceStatus] = None,
303
+ ) -> List[Sequence]:
304
+ if status is None:
305
+ return list(self.seqs_dict.values())
306
+ else:
307
+ return [seq for seq in self.seqs_dict.values() if seq.status == status]
308
+
309
+ def get_unfinished_seqs(self) -> List[Sequence]:
310
+ return [seq for seq in self.seqs_dict.values() if not seq.is_finished()]
311
+
312
+ def get_finished_seqs(self) -> List[Sequence]:
313
+ return [seq for seq in self.seqs_dict.values() if seq.is_finished()]
314
+
315
+ def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
316
+ return len(self.get_seqs(status))
317
+
318
+ def num_unfinished_seqs(self) -> int:
319
+ return len(self.get_unfinished_seqs())
320
+
321
+ def num_finished_seqs(self) -> int:
322
+ return len(self.get_finished_seqs())
323
+
324
+ def find(self, seq_id: int) -> Sequence:
325
+ if seq_id not in self.seqs_dict:
326
+ raise ValueError(f"Sequence {seq_id} not found.")
327
+ return self.seqs_dict[seq_id]
328
+
329
+ def add(self, seq: Sequence) -> None:
330
+ if seq.seq_id in self.seqs_dict:
331
+ raise ValueError(f"Sequence {seq.seq_id} already exists.")
332
+ self.seqs_dict[seq.seq_id] = seq
333
+
334
+ def remove(self, seq_id: int) -> None:
335
+ if seq_id not in self.seqs_dict:
336
+ raise ValueError(f"Sequence {seq_id} not found.")
337
+ del self.seqs_dict[seq_id]
338
+
339
+ def is_finished(self) -> bool:
340
+ return all(seq.is_finished() for seq in self.get_seqs())
341
+
342
+ def __repr__(self) -> str:
343
+ return (
344
+ f"SequenceGroup(request_id={self.request_id}, "
345
+ f"sampling_params={self.sampling_params}, "
346
+ f"num_seqs={len(self.seqs_dict)})"
347
+ )
348
+
349
+
350
+ class SequenceGroupMetadata:
351
+ """Metadata for a sequence group. Used to create `InputMetadata`.
352
+
353
+
354
+ Args:
355
+ request_id: The ID of the request.
356
+ is_prompt: Whether the request is at prompt stage.
357
+ seq_data: The sequence data. (Seq id -> sequence data)
358
+ sampling_params: The sampling parameters used to generate the outputs.
359
+ block_tables: The block tables. (Seq id -> list of physical block
360
+ numbers)
361
+ """
362
+
363
+ def __init__(
364
+ self,
365
+ request_id: str,
366
+ is_prompt: bool,
367
+ seq_data: Dict[int, SequenceData],
368
+ sampling_params: SamplingParams,
369
+ block_tables: Dict[int, List[int]],
370
+ ) -> None:
371
+ self.request_id = request_id
372
+ self.is_prompt = is_prompt
373
+ self.seq_data = seq_data
374
+ self.sampling_params = sampling_params
375
+ self.block_tables = block_tables
376
+
377
+
378
+ class SequenceOutput:
379
+ """The model output associated with a sequence.
380
+
381
+ Args:
382
+ parent_seq_id: The ID of the parent sequence (for forking in beam
383
+ search).
384
+ output_token: The output token ID.
385
+ logprobs: The logprobs of the output token.
386
+ (Token id -> logP(x_i+1 | x_0, ..., x_i))
387
+ """
388
+
389
+ def __init__(
390
+ self,
391
+ parent_seq_id: int,
392
+ output_token: int,
393
+ logprobs: Dict[int, float],
394
+ hidden_states: Optional[torch.Tensor] = None,
395
+ finished: bool = False,
396
+ ) -> None:
397
+ self.parent_seq_id = parent_seq_id
398
+ self.output_token = output_token
399
+ self.logprobs = logprobs
400
+ self.finished = finished
401
+ self.hidden_states = hidden_states
402
+
403
+ def __repr__(self) -> str:
404
+ return (
405
+ f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
406
+ f"output_token={self.output_token}, "
407
+ f"logprobs={self.logprobs}),"
408
+ f"finished={self.finished}),"
409
+ f"hidden_states={self.hidden_states.shape if self.hidden_states is not None else None}"
410
+ )
411
+
412
+ def __eq__(self, other: object) -> bool:
413
+ if not isinstance(other, SequenceOutput):
414
+ raise NotImplementedError()
415
+ return (
416
+ self.parent_seq_id == other.parent_seq_id
417
+ and self.output_token == other.output_token
418
+ and self.logprobs == other.logprobs
419
+ )
420
+
421
+
422
+ class SequenceGroupOutput:
423
+ """The model output associated with a sequence group."""
424
+
425
+ def __init__(
426
+ self,
427
+ samples: List[SequenceOutput],
428
+ prompt_logprobs: Optional[PromptLogprobs],
429
+ ) -> None:
430
+ self.samples = samples
431
+ self.prompt_logprobs = prompt_logprobs
432
+
433
+ def __repr__(self) -> str:
434
+ return (
435
+ f"SequenceGroupOutput(samples={self.samples}, "
436
+ f"prompt_logprobs={self.prompt_logprobs})"
437
+ )
438
+
439
+ def __eq__(self, other: object) -> bool:
440
+ if not isinstance(other, SequenceGroupOutput):
441
+ raise NotImplementedError()
442
+ return (
443
+ self.samples == other.samples
444
+ and self.prompt_logprobs == other.prompt_logprobs
445
+ )
446
+
447
+
448
+ # For each sequence group, we generate a list of SequenceOutput object,
449
+ # each of which contains one possible candidate for the next token.
450
+ SamplerOutput = List[SequenceGroupOutput]
ChatTTS/model/velocity/worker.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A GPU worker class."""
2
+
3
+ import os
4
+ from typing import Dict, List, Optional, Tuple
5
+
6
+ import torch
7
+ import torch.distributed
8
+
9
+ from vllm.config import CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig
10
+ from vllm.model_executor import set_random_seed
11
+ from vllm.model_executor.parallel_utils.communication_op import broadcast_object_list
12
+ from vllm.model_executor.parallel_utils.parallel_state import initialize_model_parallel
13
+ from vllm.sequence import SamplerOutput, SequenceGroupMetadata
14
+ from vllm.worker.cache_engine import CacheEngine
15
+
16
+ from .model_runner import ModelRunner
17
+
18
+
19
+ class Worker:
20
+ """A worker class that executes (a partition of) the model on a GPU.
21
+
22
+ Each worker is associated with a single GPU. The worker is responsible for
23
+ maintaining the KV cache and executing the model on the GPU. In case of
24
+ distributed inference, each worker is assigned a partition of the model.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ model_config: ModelConfig,
30
+ parallel_config: ParallelConfig,
31
+ scheduler_config: SchedulerConfig,
32
+ local_rank: int,
33
+ rank: int,
34
+ distributed_init_method: str,
35
+ post_model_path: str,
36
+ is_driver_worker: bool = False,
37
+ ) -> None:
38
+ self.model_config = model_config
39
+ self.parallel_config = parallel_config
40
+ self.scheduler_config = scheduler_config
41
+ self.local_rank = local_rank
42
+ self.rank = rank
43
+ self.distributed_init_method = distributed_init_method
44
+ self.is_driver_worker = is_driver_worker
45
+ self.post_model_path = post_model_path
46
+
47
+ if self.is_driver_worker:
48
+ assert self.rank == 0, "The driver worker must have rank 0."
49
+
50
+ self.model_runner = ModelRunner(
51
+ model_config,
52
+ parallel_config,
53
+ scheduler_config,
54
+ is_driver_worker,
55
+ post_model_path,
56
+ )
57
+ # Uninitialized cache engine. Will be initialized by
58
+ # self.init_cache_engine().
59
+ self.cache_config = None
60
+ self.cache_engine = None
61
+ self.cache_events = None
62
+ self.gpu_cache = None
63
+
64
+ def init_model(self) -> None:
65
+ # torch.distributed.all_reduce does not free the input tensor until
66
+ # the synchronization point. This causes the memory usage to grow
67
+ # as the number of all_reduce calls increases. This env var disables
68
+ # this behavior.
69
+ # Related issue:
70
+ # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
71
+ os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
72
+
73
+ # This env var set by Ray causes exceptions with graph building.
74
+ os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
75
+ self.device = torch.device(f"cuda:{self.local_rank}")
76
+ torch.cuda.set_device(self.device)
77
+
78
+ _check_if_gpu_supports_dtype(self.model_config.dtype)
79
+
80
+ # Initialize the distributed environment.
81
+ _init_distributed_environment(
82
+ self.parallel_config, self.rank, self.distributed_init_method
83
+ )
84
+
85
+ # Initialize the model.
86
+ set_random_seed(self.model_config.seed)
87
+
88
+ def load_model(self):
89
+ self.model_runner.load_model()
90
+
91
+ @torch.inference_mode()
92
+ def profile_num_available_blocks(
93
+ self,
94
+ block_size: int,
95
+ gpu_memory_utilization: float,
96
+ cpu_swap_space: int,
97
+ ) -> Tuple[int, int]:
98
+ # Profile the memory usage of the model and get the maximum number of
99
+ # cache blocks that can be allocated with the remaining free memory.
100
+ torch.cuda.empty_cache()
101
+
102
+ # Execute a forward pass with dummy inputs to profile the memory usage
103
+ # of the model.
104
+ self.model_runner.profile_run()
105
+
106
+ # Calculate the number of blocks that can be allocated with the
107
+ # profiled peak memory.
108
+ torch.cuda.synchronize()
109
+ free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
110
+ peak_memory = total_gpu_memory - free_gpu_memory
111
+
112
+ cache_block_size = CacheEngine.get_cache_block_size(
113
+ block_size, self.model_config, self.parallel_config
114
+ )
115
+ num_gpu_blocks = int(
116
+ (total_gpu_memory * gpu_memory_utilization - peak_memory)
117
+ // cache_block_size
118
+ )
119
+ num_cpu_blocks = int(cpu_swap_space // cache_block_size)
120
+ num_gpu_blocks = max(num_gpu_blocks, 0)
121
+ num_cpu_blocks = max(num_cpu_blocks, 0)
122
+ torch.cuda.empty_cache()
123
+ return num_gpu_blocks, num_cpu_blocks
124
+
125
+ def init_cache_engine(self, cache_config: CacheConfig) -> None:
126
+ self.cache_config = cache_config
127
+ self.cache_engine = CacheEngine(
128
+ self.cache_config, self.model_config, self.parallel_config
129
+ )
130
+ self.cache_events = self.cache_engine.events
131
+ self.gpu_cache = self.cache_engine.gpu_cache
132
+ self.model_runner.set_block_size(self.cache_engine.block_size)
133
+
134
+ def warm_up_model(self) -> None:
135
+ if not self.model_config.enforce_eager:
136
+ self.model_runner.capture_model(self.gpu_cache)
137
+ # Reset the seed to ensure that the random state is not affected by
138
+ # the model initialization and profiling.
139
+ set_random_seed(self.model_config.seed)
140
+
141
+ def cache_swap(
142
+ self,
143
+ blocks_to_swap_in: Dict[int, int],
144
+ blocks_to_swap_out: Dict[int, int],
145
+ blocks_to_copy: Dict[int, List[int]],
146
+ ) -> None:
147
+ # Issue cache operations.
148
+ issued_cache_op = False
149
+ if blocks_to_swap_in:
150
+ self.cache_engine.swap_in(blocks_to_swap_in)
151
+ issued_cache_op = True
152
+ if blocks_to_swap_out:
153
+ self.cache_engine.swap_out(blocks_to_swap_out)
154
+ issued_cache_op = True
155
+ if blocks_to_copy:
156
+ self.cache_engine.copy(blocks_to_copy)
157
+ issued_cache_op = True
158
+
159
+ cache_events = self.cache_events if issued_cache_op else None
160
+
161
+ # Wait for cache operations to finish.
162
+ # TODO(woosuk): Profile swapping overhead and optimize if needed.
163
+ if cache_events is not None:
164
+ for event in cache_events:
165
+ event.wait()
166
+
167
+ @torch.inference_mode()
168
+ def execute_model(
169
+ self,
170
+ seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
171
+ blocks_to_swap_in: Optional[Dict[int, int]] = None,
172
+ blocks_to_swap_out: Optional[Dict[int, int]] = None,
173
+ blocks_to_copy: Optional[Dict[int, List[int]]] = None,
174
+ ) -> Optional[SamplerOutput]:
175
+ if self.is_driver_worker:
176
+ assert seq_group_metadata_list is not None
177
+ num_seq_groups = len(seq_group_metadata_list)
178
+ assert blocks_to_swap_in is not None
179
+ assert blocks_to_swap_out is not None
180
+ assert blocks_to_copy is not None
181
+ block_swapping_info = [
182
+ blocks_to_swap_in,
183
+ blocks_to_swap_out,
184
+ blocks_to_copy,
185
+ ]
186
+ broadcast_object_list([num_seq_groups] + block_swapping_info, src=0)
187
+ else:
188
+ # num_seq_groups, blocks_to_swap_in, blocks_to_swap_out,
189
+ # blocks_to_copy (4 elements)
190
+ recv_data = [None] * 4
191
+ broadcast_object_list(recv_data, src=0)
192
+ num_seq_groups = recv_data[0]
193
+ block_swapping_info = recv_data[1:]
194
+
195
+ self.cache_swap(*block_swapping_info)
196
+
197
+ # If there is no input, we don't need to execute the model.
198
+ if num_seq_groups == 0:
199
+ return {}
200
+
201
+ output = self.model_runner.execute_model(
202
+ seq_group_metadata_list, self.gpu_cache
203
+ )
204
+ return output
205
+
206
+
207
+ def _init_distributed_environment(
208
+ parallel_config: ParallelConfig,
209
+ rank: int,
210
+ distributed_init_method: Optional[str] = None,
211
+ ) -> None:
212
+ """Initialize the distributed environment."""
213
+ if torch.distributed.is_initialized():
214
+ torch_world_size = torch.distributed.get_world_size()
215
+ if torch_world_size != parallel_config.world_size:
216
+ raise RuntimeError(
217
+ "torch.distributed is already initialized but the torch world "
218
+ "size does not match parallel_config.world_size "
219
+ f"({torch_world_size} vs. {parallel_config.world_size})."
220
+ )
221
+ elif not distributed_init_method:
222
+ raise ValueError(
223
+ "distributed_init_method must be set if torch.distributed "
224
+ "is not already initialized"
225
+ )
226
+ else:
227
+ torch.distributed.init_process_group(
228
+ backend="nccl",
229
+ world_size=parallel_config.world_size,
230
+ rank=rank,
231
+ init_method=distributed_init_method,
232
+ )
233
+
234
+ # A small all_reduce for warmup.
235
+ torch.distributed.all_reduce(torch.zeros(1).cuda())
236
+ initialize_model_parallel(
237
+ parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size
238
+ )
239
+
240
+
241
+ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
242
+ # Check if the GPU supports the dtype.
243
+ if torch_dtype == torch.bfloat16:
244
+ compute_capability = torch.cuda.get_device_capability()
245
+ if compute_capability[0] < 8:
246
+ gpu_name = torch.cuda.get_device_name()
247
+ raise ValueError(
248
+ "Bfloat16 is only supported on GPUs with compute capability "
249
+ f"of at least 8.0. Your {gpu_name} GPU has compute capability "
250
+ f"{compute_capability[0]}.{compute_capability[1]}."
251
+ )
ChatTTS/norm.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import re
4
+ from typing import Dict, Tuple, List, Literal, Callable, Optional
5
+ import sys
6
+
7
+ from numba import jit
8
+ import numpy as np
9
+
10
+ from .utils import del_all
11
+
12
+
13
+ @jit
14
+ def _find_index(table: np.ndarray, val: np.uint16):
15
+ for i in range(table.size):
16
+ if table[i] == val:
17
+ return i
18
+ return -1
19
+
20
+
21
+ @jit
22
+ def _fast_replace(
23
+ table: np.ndarray, text: bytes
24
+ ) -> Tuple[np.ndarray, List[Tuple[str, str]]]:
25
+ result = np.frombuffer(text, dtype=np.uint16).copy()
26
+ replaced_words = []
27
+ for i in range(result.size):
28
+ ch = result[i]
29
+ p = _find_index(table[0], ch)
30
+ if p >= 0:
31
+ repl_char = table[1][p]
32
+ result[i] = repl_char
33
+ replaced_words.append((chr(ch), chr(repl_char)))
34
+ return result, replaced_words
35
+
36
+
37
+ @jit
38
+ def _split_tags(text: str) -> Tuple[List[str], List[str]]:
39
+ texts: List[str] = []
40
+ tags: List[str] = []
41
+ current_text = ""
42
+ current_tag = ""
43
+ for c in text:
44
+ if c == "[":
45
+ texts.append(current_text)
46
+ current_text = ""
47
+ current_tag = c
48
+ elif current_tag != "":
49
+ current_tag += c
50
+ else:
51
+ current_text += c
52
+ if c == "]":
53
+ tags.append(current_tag)
54
+ current_tag = ""
55
+ if current_text != "":
56
+ texts.append(current_text)
57
+ return texts, tags
58
+
59
+
60
+ @jit
61
+ def _combine_tags(texts: List[str], tags: List[str]) -> str:
62
+ text = ""
63
+ for t in texts:
64
+ tg = ""
65
+ if len(tags) > 0:
66
+ tg = tags.pop(0)
67
+ text += t + tg
68
+ return text
69
+
70
+
71
+ class Normalizer:
72
+ def __init__(self, map_file_path: str, logger=logging.getLogger(__name__)):
73
+ self.logger = logger
74
+ self.normalizers: Dict[str, Callable[[str], str]] = {}
75
+ self.homophones_map = self._load_homophones_map(map_file_path)
76
+ """
77
+ homophones_map
78
+
79
+ Replace the mispronounced characters with correctly pronounced ones.
80
+
81
+ Creation process of homophones_map.json:
82
+
83
+ 1. Establish a word corpus using the [Tencent AI Lab Embedding Corpora v0.2.0 large] with 12 million entries. After cleaning, approximately 1.8 million entries remain. Use ChatTTS to infer the text.
84
+ 2. Record discrepancies between the inferred and input text, identifying about 180,000 misread words.
85
+ 3. Create a pinyin to common characters mapping using correctly read characters by ChatTTS.
86
+ 4. For each discrepancy, extract the correct pinyin using [python-pinyin] and find homophones with the correct pronunciation from the mapping.
87
+
88
+ Thanks to:
89
+ [Tencent AI Lab Embedding Corpora for Chinese and English Words and Phrases](https://ai.tencent.com/ailab/nlp/en/embedding.html)
90
+ [python-pinyin](https://github.com/mozillazg/python-pinyin)
91
+
92
+ """
93
+ self.coding = "utf-16-le" if sys.byteorder == "little" else "utf-16-be"
94
+ self.reject_pattern = re.compile(r"[^\u4e00-\u9fffA-Za-z,。、,\. ]")
95
+ self.sub_pattern = re.compile(r"\[[\w_]+\]")
96
+ self.chinese_char_pattern = re.compile(r"[\u4e00-\u9fff]")
97
+ self.english_word_pattern = re.compile(r"\b[A-Za-z]+\b")
98
+ self.character_simplifier = str.maketrans(
99
+ {
100
+ ":": ",",
101
+ ";": ",",
102
+ "!": "。",
103
+ "(": ",",
104
+ ")": ",",
105
+ "【": ",",
106
+ "】": ",",
107
+ "『": ",",
108
+ "』": ",",
109
+ "「": ",",
110
+ "」": ",",
111
+ "《": ",",
112
+ "》": ",",
113
+ "-": ",",
114
+ ":": ",",
115
+ ";": ",",
116
+ "!": ".",
117
+ "(": ",",
118
+ ")": ",",
119
+ # "[": ",",
120
+ # "]": ",",
121
+ ">": ",",
122
+ "<": ",",
123
+ "-": ",",
124
+ }
125
+ )
126
+ self.halfwidth_2_fullwidth = str.maketrans(
127
+ {
128
+ "!": "!",
129
+ '"': "“",
130
+ "'": "‘",
131
+ "#": "#",
132
+ "$": "$",
133
+ "%": "%",
134
+ "&": "&",
135
+ "(": "(",
136
+ ")": ")",
137
+ ",": ",",
138
+ "-": "-",
139
+ "*": "*",
140
+ "+": "+",
141
+ ".": "。",
142
+ "/": "/",
143
+ ":": ":",
144
+ ";": ";",
145
+ "<": "<",
146
+ "=": "=",
147
+ ">": ">",
148
+ "?": "?",
149
+ "@": "@",
150
+ # '[': '[',
151
+ "\\": "\",
152
+ # ']': ']',
153
+ "^": "^",
154
+ # '_': '_',
155
+ "`": "`",
156
+ "{": "{",
157
+ "|": "|",
158
+ "}": "}",
159
+ "~": "~",
160
+ }
161
+ )
162
+
163
+ def __call__(
164
+ self,
165
+ text: str,
166
+ do_text_normalization=True,
167
+ do_homophone_replacement=True,
168
+ lang: Optional[Literal["zh", "en"]] = None,
169
+ ) -> str:
170
+ if do_text_normalization:
171
+ _lang = self._detect_language(text) if lang is None else lang
172
+ if _lang in self.normalizers:
173
+ texts, tags = _split_tags(text)
174
+ self.logger.debug("split texts %s, tags %s", str(texts), str(tags))
175
+ texts = [self.normalizers[_lang](t) for t in texts]
176
+ self.logger.debug("normed texts %s", str(texts))
177
+ text = _combine_tags(texts, tags) if len(tags) > 0 else texts[0]
178
+ self.logger.debug("combined text %s", text)
179
+ if _lang == "zh":
180
+ text = self._apply_half2full_map(text)
181
+ invalid_characters = self._count_invalid_characters(text)
182
+ if len(invalid_characters):
183
+ self.logger.warning(f"found invalid characters: {invalid_characters}")
184
+ text = self._apply_character_map(text)
185
+ if do_homophone_replacement:
186
+ arr, replaced_words = _fast_replace(
187
+ self.homophones_map,
188
+ text.encode(self.coding),
189
+ )
190
+ if replaced_words:
191
+ text = arr.tobytes().decode(self.coding)
192
+ repl_res = ", ".join([f"{_[0]}->{_[1]}" for _ in replaced_words])
193
+ self.logger.info(f"replace homophones: {repl_res}")
194
+ if len(invalid_characters):
195
+ texts, tags = _split_tags(text)
196
+ self.logger.debug("split texts %s, tags %s", str(texts), str(tags))
197
+ texts = [self.reject_pattern.sub("", t) for t in texts]
198
+ self.logger.debug("normed texts %s", str(texts))
199
+ text = _combine_tags(texts, tags) if len(tags) > 0 else texts[0]
200
+ self.logger.debug("combined text %s", text)
201
+ return text
202
+
203
+ def register(self, name: str, normalizer: Callable[[str], str]) -> bool:
204
+ if name in self.normalizers:
205
+ self.logger.warning(f"name {name} has been registered")
206
+ return False
207
+ try:
208
+ val = normalizer("test string 测试字符串")
209
+ if not isinstance(val, str):
210
+ self.logger.warning("normalizer must be a callable of type (str) -> str")
211
+ return False
212
+ except Exception as e:
213
+ self.logger.warning(e)
214
+ return False
215
+ self.normalizers[name] = normalizer
216
+ return True
217
+
218
+ def unregister(self, name: str):
219
+ if name in self.normalizers:
220
+ del self.normalizers[name]
221
+
222
+ def destroy(self):
223
+ del_all(self.normalizers)
224
+ del self.homophones_map
225
+
226
+ def _load_homophones_map(self, map_file_path: str) -> np.ndarray:
227
+ with open(map_file_path, "r", encoding="utf-8") as f:
228
+ homophones_map: Dict[str, str] = json.load(f)
229
+ map = np.empty((2, len(homophones_map)), dtype=np.uint32)
230
+ for i, k in enumerate(homophones_map.keys()):
231
+ map[:, i] = (ord(k), ord(homophones_map[k]))
232
+ del homophones_map
233
+ return map
234
+
235
+ def _count_invalid_characters(self, s: str):
236
+ s = self.sub_pattern.sub("", s)
237
+ non_alphabetic_chinese_chars = self.reject_pattern.findall(s)
238
+ return set(non_alphabetic_chinese_chars)
239
+
240
+ def _apply_half2full_map(self, text: str) -> str:
241
+ return text.translate(self.halfwidth_2_fullwidth)
242
+
243
+ def _apply_character_map(self, text: str) -> str:
244
+ return text.translate(self.character_simplifier)
245
+
246
+ def _detect_language(self, sentence: str) -> Literal["zh", "en"]:
247
+ chinese_chars = self.chinese_char_pattern.findall(sentence)
248
+ english_words = self.english_word_pattern.findall(sentence)
249
+
250
+ if len(chinese_chars) > len(english_words):
251
+ return "zh"
252
+ else:
253
+ return "en"
ChatTTS/res/__init__.py ADDED
File without changes
ChatTTS/res/homophones_map.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatTTS/res/sha256_map.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "sha256_asset_Decoder_pt" : "9964e36e840f0e3a748c5f716fe6de6490d2135a5f5155f4a642d51860e2ec38",
3
+ "sha256_asset_DVAE_full_pt" : "553eb75763511e23f3e5f86303e2163c5ca775489d637fb635d979c8ae58bbe5",
4
+ "sha256_asset_Embed_safetensors" : "2ff0be7134934155741b643b74e32fb6bf3eec41257984459b2ed60cdb4c48b0",
5
+ "sha256_asset_Vocos_pt" : "09a670eda1c08b740013679c7a90ebb7f1a97646ea7673069a6838e6b51d6c58",
6
+
7
+ "sha256_asset_gpt_config_json" : "0aaa1ecd96c49ad4f473459eb1982fa7ad79fa5de08cde2781bf6ad1f9a0c236",
8
+ "sha256_asset_gpt_model_safetensors" : "cd0806fd971f52f6a22c923ec64982b305e817bcc41ca83417fcf9141b984a0f",
9
+
10
+ "sha256_asset_tokenizer_special_tokens_map_json": "bd0ac9d9bb1657996b5c5fbcaa7d80f8de530d01a283da97f89deae5b1b8d011",
11
+ "sha256_asset_tokenizer_tokenizer_config_json" : "43e9d658b554fa5ee8d8e1d763349323bfef1ed7a89c0794220ab8861387d421",
12
+ "sha256_asset_tokenizer_tokenizer_json" : "843838a64e121e23e774cc75874c6fe862198d9f7dd43747914633a8fd89c20e"
13
+ }
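The key names in this map are not arbitrary: `check_folder` in `ChatTTS/utils/dl.py` (added below) rebuilds them from the asset path by concatenating `sha256_`, each inner directory, and the file name with dots replaced by underscores. A short sketch of that convention; the helper name `sha256_key` is illustrative only, not part of the codebase.

```python
# Reconstruct the lookup keys used against sha256_map.json, mirroring the key
# construction in check_folder() from ChatTTS/utils/dl.py.
def sha256_key(*inner_dirs: str, name: str) -> str:
    key = "sha256_"
    for d in inner_dirs:
        key += f"{d}_"
    return key + name.replace(".", "_")


assert sha256_key("asset", name="DVAE_full.pt") == "sha256_asset_DVAE_full_pt"
assert sha256_key("asset", "gpt", name="model.safetensors") == "sha256_asset_gpt_model_safetensors"
assert sha256_key("asset", "tokenizer", name="tokenizer_config.json") == "sha256_asset_tokenizer_tokenizer_config_json"
```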
ChatTTS/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .dl import check_all_assets, download_all_assets
2
+ from .gpu import select_device
3
+ from .io import get_latest_modified_file, del_all
4
+ from .log import logger
ChatTTS/utils/dl.py ADDED
@@ -0,0 +1,220 @@
1
+ import os
2
+ from pathlib import Path
3
+ import hashlib
4
+ import requests
5
+ from io import BytesIO
6
+ from typing import Dict, Tuple, Optional
7
+ from mmap import mmap, ACCESS_READ
8
+
9
+ from .log import logger
10
+
11
+
12
+ def sha256(fileno: int) -> str:
13
+ data = mmap(fileno, 0, access=ACCESS_READ)
14
+ h = hashlib.sha256(data).hexdigest()
15
+ del data
16
+ return h
17
+
18
+
19
+ def check_model(
20
+ dir_name: Path, model_name: str, hash: str, remove_incorrect=False
21
+ ) -> bool:
22
+ target = dir_name / model_name
23
+ relname = target.as_posix()
24
+ logger.get_logger().debug(f"checking {relname}...")
25
+ if not os.path.exists(target):
26
+ logger.get_logger().info(f"{target} not exist.")
27
+ return False
28
+ with open(target, "rb") as f:
29
+ digest = sha256(f.fileno())
30
+ bakfile = f"{target}.bak"
31
+ if digest != hash:
32
+ logger.get_logger().warning(f"{target} sha256 hash mismatch.")
33
+ logger.get_logger().info(f"expected: {hash}")
34
+ logger.get_logger().info(f"real val: {digest}")
35
+ if remove_incorrect:
36
+ if not os.path.exists(bakfile):
37
+ os.rename(str(target), bakfile)
38
+ else:
39
+ os.remove(str(target))
40
+ return False
41
+ if remove_incorrect and os.path.exists(bakfile):
42
+ os.remove(bakfile)
43
+ return True
44
+
45
+
46
+ def check_folder(
47
+ base_dir: Path,
48
+ *inner_dirs: str,
49
+ names: Tuple[str],
50
+ sha256_map: Dict[str, str],
51
+ update=False,
52
+ ) -> bool:
53
+ key = "sha256_"
54
+ current_dir = base_dir
55
+ for d in inner_dirs:
56
+ current_dir /= d
57
+ key += f"{d}_"
58
+
59
+ for model in names:
60
+ menv = model.replace(".", "_")
61
+ if not check_model(current_dir, model, sha256_map[f"{key}{menv}"], update):
62
+ return False
63
+ return True
64
+
65
+
66
+ def check_all_assets(base_dir: Path, sha256_map: Dict[str, str], update=False) -> bool:
67
+ logger.get_logger().info("checking assets...")
68
+
69
+ if not check_folder(
70
+ base_dir,
71
+ "asset",
72
+ names=(
73
+ "Decoder.pt",
74
+ "DVAE_full.pt",
75
+ "Embed.safetensors",
76
+ "Vocos.pt",
77
+ ),
78
+ sha256_map=sha256_map,
79
+ update=update,
80
+ ):
81
+ return False
82
+
83
+ if not check_folder(
84
+ base_dir,
85
+ "asset",
86
+ "gpt",
87
+ names=(
88
+ "config.json",
89
+ "model.safetensors",
90
+ ),
91
+ sha256_map=sha256_map,
92
+ update=update,
93
+ ):
94
+ return False
95
+
96
+ if not check_folder(
97
+ base_dir,
98
+ "asset",
99
+ "tokenizer",
100
+ names=(
101
+ "special_tokens_map.json",
102
+ "tokenizer_config.json",
103
+ "tokenizer.json",
104
+ ),
105
+ sha256_map=sha256_map,
106
+ update=update,
107
+ ):
108
+ return False
109
+
110
+ logger.get_logger().info("all assets are already latest.")
111
+ return True
112
+
113
+
114
+ def download_and_extract_tar_gz(
115
+ url: str, folder: str, headers: Optional[Dict[str, str]] = None
116
+ ):
117
+ import tarfile
118
+
119
+ logger.get_logger().info(f"downloading {url}")
120
+ response = requests.get(url, headers=headers, stream=True, timeout=(10, 3))
121
+ with BytesIO() as out_file:
122
+ out_file.write(response.content)
123
+ out_file.seek(0)
124
+ logger.get_logger().info(f"downloaded.")
125
+ with tarfile.open(fileobj=out_file, mode="r:gz") as tar:
126
+ tar.extractall(folder)
127
+ logger.get_logger().info(f"extracted into {folder}")
128
+
129
+
130
+ def download_and_extract_zip(
131
+ url: str, folder: str, headers: Optional[Dict[str, str]] = None
132
+ ):
133
+ import zipfile
134
+
135
+ logger.get_logger().info(f"downloading {url}")
136
+ response = requests.get(url, headers=headers, stream=True, timeout=(10, 3))
137
+ with BytesIO() as out_file:
138
+ out_file.write(response.content)
139
+ out_file.seek(0)
140
+ logger.get_logger().info(f"downloaded.")
141
+ with zipfile.ZipFile(out_file) as zip_ref:
142
+ zip_ref.extractall(folder)
143
+ logger.get_logger().info(f"extracted into {folder}")
144
+
145
+
146
+ def download_dns_yaml(url: str, folder: str, headers: Dict[str, str]):
147
+ logger.get_logger().info(f"downloading {url}")
148
+ response = requests.get(url, headers=headers, stream=True, timeout=(100, 3))
149
+ with open(os.path.join(folder, "dns.yaml"), "wb") as out_file:
150
+ out_file.write(response.content)
151
+ logger.get_logger().info(f"downloaded into {folder}")
152
+
153
+
154
+ def download_all_assets(tmpdir: str, version="0.2.8"):
155
+ import subprocess
156
+ import platform
157
+
158
+ archs = {
159
+ "aarch64": "arm64",
160
+ "armv8l": "arm64",
161
+ "arm64": "arm64",
162
+ "x86": "386",
163
+ "i386": "386",
164
+ "i686": "386",
165
+ "386": "386",
166
+ "x86_64": "amd64",
167
+ "x64": "amd64",
168
+ "amd64": "amd64",
169
+ }
170
+ system_type = platform.system().lower()
171
+ architecture = platform.machine().lower()
172
+ is_win = system_type == "windows"
173
+
174
+ architecture = archs.get(architecture, None)
175
+ if not architecture:
176
+ logger.get_logger().error(f"architecture {architecture} is not supported")
177
+ exit(1)
178
+ try:
179
+ BASE_URL = "https://github.com/fumiama/RVC-Models-Downloader/releases/download/"
180
+ suffix = "zip" if is_win else "tar.gz"
181
+ RVCMD_URL = BASE_URL + f"v{version}/rvcmd_{system_type}_{architecture}.{suffix}"
182
+ cmdfile = os.path.join(tmpdir, "rvcmd")
183
+ if is_win:
184
+ download_and_extract_zip(RVCMD_URL, tmpdir)
185
+ cmdfile += ".exe"
186
+ else:
187
+ download_and_extract_tar_gz(RVCMD_URL, tmpdir)
188
+ os.chmod(cmdfile, 0o755)
189
+ subprocess.run([cmdfile, "-notui", "-w", "0", "assets/chtts"])
190
+ except Exception:
191
+ BASE_URL = (
192
+ "https://gitea.seku.su/fumiama/RVC-Models-Downloader/releases/download/"
193
+ )
194
+ suffix = "zip" if is_win else "tar.gz"
195
+ RVCMD_URL = BASE_URL + f"v{version}/rvcmd_{system_type}_{architecture}.{suffix}"
196
+ download_dns_yaml(
197
+ "https://gitea.seku.su/fumiama/RVC-Models-Downloader/raw/branch/main/dns.yaml",
198
+ tmpdir,
199
+ headers={
200
+ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
201
+ },
202
+ )
203
+ cmdfile = os.path.join(tmpdir, "rvcmd")
204
+ if is_win:
205
+ download_and_extract_zip(RVCMD_URL, tmpdir)
206
+ cmdfile += ".exe"
207
+ else:
208
+ download_and_extract_tar_gz(RVCMD_URL, tmpdir)
209
+ os.chmod(cmdfile, 0o755)
210
+ subprocess.run(
211
+ [
212
+ cmdfile,
213
+ "-notui",
214
+ "-w",
215
+ "0",
216
+ "-dns",
217
+ os.path.join(tmpdir, "dns.yaml"),
218
+ "assets/chtts",
219
+ ]
220
+ )
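A hedged sketch of how these helpers can be combined at load time: verify the local checkpoints against `sha256_map.json` and, if anything is missing or corrupted, fetch the assets with the bundled `rvcmd` downloader and verify again. The asset root and map path are assumptions based on this commit's layout, not code taken from the repository.

```python
# Sketch only: verify assets, download on mismatch, then re-verify.
import json
import tempfile
from pathlib import Path

from ChatTTS.utils.dl import check_all_assets, download_all_assets

base_dir = Path(".")  # assumed repository root containing the "asset" folder
with open("ChatTTS/res/sha256_map.json", "r", encoding="utf-8") as f:
    sha256_map = json.load(f)

# update=True renames files with bad hashes to *.bak (or deletes them when a
# .bak already exists) so that they get re-fetched.
if not check_all_assets(base_dir, sha256_map, update=True):
    with tempfile.TemporaryDirectory() as tmp:
        download_all_assets(tmpdir=tmp)
    if not check_all_assets(base_dir, sha256_map, update=False):
        raise RuntimeError("ChatTTS assets are still missing or corrupted")
```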
ChatTTS/utils/gpu.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+
3
+ from .log import logger
4
+
5
+
6
+ def select_device(min_memory=2047, experimental=False):
7
+ if torch.cuda.is_available():
8
+ selected_gpu = 0
9
+ max_free_memory = -1
10
+ for i in range(torch.cuda.device_count()):
11
+ props = torch.cuda.get_device_properties(i)
12
+ free_memory = props.total_memory - torch.cuda.memory_reserved(i)
13
+ if max_free_memory < free_memory:
14
+ selected_gpu = i
15
+ max_free_memory = free_memory
16
+ free_memory_mb = max_free_memory / (1024 * 1024)
17
+ if free_memory_mb < min_memory:
18
+ logger.get_logger().warning(
19
+ f"GPU {selected_gpu} has {round(free_memory_mb, 2)} MB memory left. Switching to CPU."
20
+ )
21
+ device = torch.device("cpu")
22
+ else:
23
+ device = torch.device(f"cuda:{selected_gpu}")
24
+ elif torch.backends.mps.is_available():
25
+ """
26
+ Currently MPS is slower than CPU while needing more memory and higher core utilization,
27
+ so only enable this for experimental use.
28
+ """
29
+ if experimental:
30
+ # For Apple M1/M2 chips with Metal Performance Shaders
31
+ logger.get_logger().warning("experimental: found Apple GPU, using MPS.")
32
+ device = torch.device("mps")
33
+ else:
34
+ logger.get_logger().info("found Apple GPU, but use CPU.")
35
+ device = torch.device("cpu")
36
+ else:
37
+ logger.get_logger().warning("no GPU found, use CPU instead")
38
+ device = torch.device("cpu")
39
+
40
+ return device
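`select_device` prefers the CUDA device with the most free memory, falls back to CPU when the free memory is below `min_memory` (in MB), and only uses MPS when `experimental=True`. A minimal calling sketch; the linear layer is just a stand-in module, not part of the codebase.

```python
# Pick a device with roughly 2 GB of free GPU memory and move a module to it.
import torch

from ChatTTS.utils.gpu import select_device

device = select_device(min_memory=2047, experimental=False)
module = torch.nn.Linear(4, 4).to(device)  # stand-in for a real ChatTTS module
print(f"selected device: {device}")
```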
ChatTTS/utils/io.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ import logging
3
+ from typing import Union
4
+ from dataclasses import is_dataclass
5
+
6
+ from .log import logger
7
+
8
+
9
+ def get_latest_modified_file(directory):
10
+
11
+ files = [os.path.join(directory, f) for f in os.listdir(directory)]
12
+ if not files:
13
+ logger.get_logger().log(
14
+ logging.WARNING, f"no files found in the directory: {directory}"
15
+ )
16
+ return None
17
+ latest_file = max(files, key=os.path.getmtime)
18
+
19
+ return latest_file
20
+
21
+
22
+ def del_all(d: Union[dict, list]):
23
+ if is_dataclass(d):
24
+ for k in list(vars(d).keys()):
25
+ x = getattr(d, k)
26
+ if isinstance(x, dict) or isinstance(x, list) or is_dataclass(x):
27
+ del_all(x)
28
+ del x
29
+ delattr(d, k)
30
+ elif isinstance(d, dict):
31
+ lst = list(d.keys())
32
+ for k in lst:
33
+ x = d.pop(k)
34
+ if isinstance(x, dict) or isinstance(x, list) or is_dataclass(x):
35
+ del_all(x)
36
+ del x
37
+ elif isinstance(d, list):
38
+ while len(d):
39
+ x = d.pop()
40
+ if isinstance(x, dict) or isinstance(x, list) or is_dataclass(x):
41
+ del_all(x)
42
+ del x
43
+ else:
44
+ del d
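`del_all` recursively empties nested dicts, lists, and dataclasses so large intermediate buffers can be released promptly, while `get_latest_modified_file` returns the most recently modified path in a directory (or `None`, with a warning, when the directory is empty). A short sketch using a throwaway dict rather than real model outputs; it assumes an `asset` directory exists.

```python
from ChatTTS.utils.io import del_all, get_latest_modified_file

cache = {"wavs": [bytearray(1024), bytearray(1024)], "meta": {"sample_rate": 24000}}
del_all(cache)  # nested containers are cleared and the dict ends up empty
print(cache)    # -> {}

latest = get_latest_modified_file("asset")  # assumes ./asset exists
print(latest)
```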
ChatTTS/utils/log.py ADDED
@@ -0,0 +1,16 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+
5
+ class Logger:
6
+ def __init__(self, logger=logging.getLogger(Path(__file__).parent.name)):
7
+ self.logger = logger
8
+
9
+ def set_logger(self, logger: logging.Logger):
10
+ self.logger = logger
11
+
12
+ def get_logger(self) -> logging.Logger:
13
+ return self.logger
14
+
15
+
16
+ logger = Logger()
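The module-level `logger` is a small shared holder used by the other utilities, so an application can redirect all of their output by swapping in its own `logging.Logger` once. A sketch; the logger name `my_app.chattts` is arbitrary.

```python
import logging

from ChatTTS.utils.log import logger

logging.basicConfig(level=logging.INFO)
logger.set_logger(logging.getLogger("my_app.chattts"))
logger.get_logger().info("ChatTTS utilities now log through my_app.chattts")
```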
Dockerfile ADDED
@@ -0,0 +1,13 @@
1
+ FROM python:3.9
2
+
3
+ RUN useradd -m -u 1000 user
4
+ USER user
5
+ ENV PATH="/home/user/.local/bin:$PATH"
6
+
7
+ WORKDIR /app
8
+
9
+ COPY --chown=user ./requirements.txt requirements.txt
10
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
11
+
12
+ COPY --chown=user . /app
13
+ CMD ["fastapi", "dev", "examples/api/main.py", "--host", "0.0.0.0", "--port", "7860"]
LICENSE ADDED
@@ -0,0 +1,661 @@
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program, if necessary.
660
+ For more information on this, and how to apply and follow the GNU AGPL, see
661
+ <https://www.gnu.org/licenses/>.
docs/cn/README.md ADDED
@@ -0,0 +1,314 @@
1
+ <div align="center">
2
+
3
+ <a href="https://trendshift.io/repositories/10489" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10489" alt="2noise%2FChatTTS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
4
+
5
+ # ChatTTS
6
+ 一款适用于日常对话的生成式语音模型。
7
+
8
+ [![Licence](https://img.shields.io/github/license/2noise/ChatTTS?style=for-the-badge)](https://github.com/2noise/ChatTTS/blob/main/LICENSE)
9
+ [![PyPI](https://img.shields.io/pypi/v/ChatTTS.svg?style=for-the-badge&color=green)](https://pypi.org/project/ChatTTS)
10
+
11
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
12
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
13
+ [![Discord](https://img.shields.io/badge/Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/Ud5Jxgx5yD)
14
+
15
+ [**English**](../../README.md) | **简体中文** | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | [**Español**](../es/README.md) | [**Français**](../fr/README.md)
16
+
17
+ </div>
18
+
19
+ > [!NOTE]
20
+ > 注意此版本可能不是最新版,所有内容请以英文版为准。
21
+
22
+ ## 简介
23
+
24
+ > [!Note]
25
+ > 这个仓库包含算法架构和一些简单的示例。
26
+
27
+ > [!Tip]
28
+ > 由本仓库衍生出的用户端产品,请参见由社区维护的索引仓库 [Awesome-ChatTTS](https://github.com/libukai/Awesome-ChatTTS)。
29
+
30
+ ChatTTS 是一款专门为对话场景(例如 LLM 助手)设计的文本转语音模型。
31
+
32
+ ### 支持的语种
33
+
34
+ - [x] 英语
35
+ - [x] 中文
36
+ - [ ] 敬请期待...
37
+
38
+ ### 亮点
39
+
40
+ > 你可以参考 **[Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)** 上的这个视频,了解本项目的详细情况。
41
+
42
+ 1. **对话式 TTS**: ChatTTS 针对对话式任务进行了优化,能够实现自然且富有表现力的合成语音。它支持多个说话者,便于生成互动式对话。
43
+ 2. **精细的控制**: 该模型可以预测和控制精细的韵律特征,包括笑声、停顿和插入语。
44
+ 3. **更好的韵律**: ChatTTS 在韵律方面超越了大多数开源 TTS 模型。我们提供预训练模型以支持进一步的研究和开发。
45
+
46
+ ### 数据集和模型
47
+
48
+ - 主模型使用了 100,000+ 小时的中文和英文音频数据进行训练。
49
+ - **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** 上的开源版本是一个在 40,000 小时数据上进行无监督微调的预训练模型。
50
+
51
+ ### 路线图
52
+
53
+ - [x] 开源 4 万小时基础模型和 spk_stats 文件。
54
+ - [x] 支持流式语音输出。
55
+ - [ ] 开源具有多情感控制功能的 4 万小时版本。
56
+ - [ ] ChatTTS.cpp (欢迎在 2noise 组织中新建仓库)。
57
+
58
+ ### 免责声明
59
+
60
+ > [!Important]
61
+ > 此仓库仅供学术用途。
62
+
63
+ 本项目旨在用于教育和研究目的,不适用于任何商业或法律目的。作者不保证信息的准确性、完整性和可靠性。此仓库中使用的信息和数据仅供学术和研究目的。数据来自公开来源,作者不声称对数据拥有任何所有权或版权。
64
+
65
+ ChatTTS 是一款强大的文本转语音系统。但是,负责任和道德地使用这项技术非常重要。为了限制 ChatTTS 的使用,我们在 40,000 小时模型的训练过程中添加了少量高频噪声,并使用 MP3 格式尽可能压缩音频质量,以防止恶意行为者将其用于犯罪目的。同时,我们内部训练了一个检测模型,并计划在未来开源它。
66
+
67
+ ### 联系方式
68
+
69
+ > 欢迎随时提交 GitHub issues/PRs。
70
+
71
+ #### 合作洽谈
72
+
73
+ 如需就模型和路线图进行合作洽谈,请发送邮件至 **[email protected]**。
74
+
75
+ #### 线上讨论
76
+
77
+ ##### 1. 官方 QQ 群
78
+
79
+ - **群 1**, 808364215 (已满)
80
+ - **群 2**, 230696694 (已满)
81
+ - **群 3**, 933639842 (已满)
82
+ - **群 4**, 608667975
83
+
84
+ ##### 2. Discord
85
+
86
+ 点击加入 [Discord](https://discord.gg/Ud5Jxgx5yD)。
87
+
88
+ ## 体验教程
89
+
90
+ ### 克隆仓库
91
+
92
+ ```bash
93
+ git clone https://github.com/2noise/ChatTTS
94
+ cd ChatTTS
95
+ ```
96
+
97
+ ### 安装依赖
98
+
99
+ #### 1. 直接安装
100
+
101
+ ```bash
102
+ pip install --upgrade -r requirements.txt
103
+ ```
104
+
105
+ #### 2. 使用 conda 安装
106
+
107
+ ```bash
108
+ conda create -n chattts
109
+ conda activate chattts
110
+ pip install -r requirements.txt
111
+ ```
112
+
113
+ #### 可选 : 如果使用 NVIDIA GPU(仅限 Linux),可安装 TransformerEngine。
114
+
115
+ > [!Note]
116
+ > 安装过程可能耗时很长。
117
+
118
+ > [!Warning]
119
+ > TransformerEngine 的适配目前正在开发中,运行时可能会遇到较多问题。仅推荐出于开发目的安装。
120
+
121
+ ```bash
122
+ pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
123
+ ```
124
+
125
+ #### 可选 : 安装 FlashAttention-2 (主要适用于 NVIDIA GPU)
126
+
127
+ > [!Note]
128
+ > 支持设备列表详见 [Hugging Face Doc](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2).
129
+
130
+ ```bash
131
+ pip install flash-attn --no-build-isolation
132
+ ```
133
+
134
+ ### 快速启动
135
+
136
+ > 确保在执行以下命令时,处于项目根目录下。
137
+
138
+ #### 1. WebUI 可视化界面
139
+
140
+ ```bash
141
+ python examples/web/webui.py
142
+ ```
143
+
144
+ #### 2. 命令行交互
145
+
146
+ > 生成的音频将保存至 `./output_audio_n.mp3`
147
+
148
+ ```bash
149
+ python examples/cmd/run.py "Your text 1." "Your text 2."
150
+ ```
151
+
152
+ ## 开发教程
153
+
154
+ ### 安装 Python 包
155
+
156
+ 1. 从 PyPI 安装稳定版
157
+
158
+ ```bash
159
+ pip install ChatTTS
160
+ ```
161
+
162
+ 2. 从 GitHub 安装最新版
163
+
164
+ ```bash
165
+ pip install git+https://github.com/2noise/ChatTTS
166
+ ```
167
+
168
+ 3. 从本地文件夹安装开发版
169
+
170
+ ```bash
171
+ pip install -e .
172
+ ```
173
+
174
+ ### 基础用法
175
+
176
+ ```python
177
+ import ChatTTS
178
+ import torch
179
+ import torchaudio
180
+
181
+ chat = ChatTTS.Chat()
182
+ chat.load(compile=False) # Set to True for better performance
183
+
184
+ texts = ["PUT YOUR 1st TEXT HERE", "PUT YOUR 2nd TEXT HERE"]
185
+
186
+ wavs = chat.infer(texts)
187
+
188
+ torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
189
+ ```
190
+
191
+ ### 进阶用法
192
+
193
+ ```python
194
+ ###################################
195
+ # Sample a speaker from Gaussian.
196
+
197
+ rand_spk = chat.sample_random_speaker()
198
+ print(rand_spk) # save it for later timbre recovery
199
+
200
+ params_infer_code = ChatTTS.Chat.InferCodeParams(
201
+ spk_emb = rand_spk, # add sampled speaker
202
+ temperature = .3, # using custom temperature
203
+ top_P = 0.7, # top P decode
204
+ top_K = 20, # top K decode
205
+ )
206
+
207
+ ###################################
208
+ # For sentence level manual control.
209
+
210
+ # use oral_(0-9), laugh_(0-2), break_(0-7)
211
+ # to generate special token in text to synthesize.
212
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
213
+ prompt='[oral_2][laugh_0][break_6]',
214
+ )
215
+
216
+ wavs = chat.infer(
217
+ texts,
218
+ params_refine_text=params_refine_text,
219
+ params_infer_code=params_infer_code,
220
+ )
221
+
222
+ ###################################
223
+ # For word level manual control.
224
+
225
+ text = 'What is [uv_break]your favorite english food?[laugh][lbreak]'
226
+ wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
227
+ torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
228
+ ```
229
+
230
+ <details open>
231
+ <summary><h4>示例: 自我介绍</h4></summary>
232
+
233
+ ```python
234
+ inputs_en = """
235
+ chatTTS is a text to speech model designed for dialogue applications.
236
+ [uv_break]it supports mixed language input [uv_break]and offers multi speaker
237
+ capabilities with precise control over prosodic elements like
238
+ [uv_break]laughter[uv_break][laugh], [uv_break]pauses, [uv_break]and intonation.
239
+ [uv_break]it delivers natural and expressive speech,[uv_break]so please
240
+ [uv_break] use the project responsibly at your own risk.[uv_break]
241
+ """.replace('\n', '') # English is still experimental.
242
+
243
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
244
+ prompt='[oral_2][laugh_0][break_4]',
245
+ )
246
+
247
+ audio_array_en = chat.infer(inputs_en, params_refine_text=params_refine_text)
248
+ torchaudio.save("output3.wav", torch.from_numpy(audio_array_en[0]), 24000)
249
+ ```
250
+
251
+ <table>
252
+ <tr>
253
+ <td align="center">
254
+
255
+ **男性音色**
256
+
257
+ </td>
258
+ <td align="center">
259
+
260
+ **女性音色**
261
+
262
+ </td>
263
+ </tr>
264
+ <tr>
265
+ <td align="center">
266
+
267
+ [男性音色](https://github.com/2noise/ChatTTS/assets/130631963/e0f51251-db7f-4d39-a0e9-3e095bb65de1)
268
+
269
+ </td>
270
+ <td align="center">
271
+
272
+ [女性音色](https://github.com/2noise/ChatTTS/assets/130631963/f5dcdd01-1091-47c5-8241-c4f6aaaa8bbd)
273
+
274
+ </td>
275
+ </tr>
276
+ </table>
277
+
278
+ </details>
279
+
280
+ ## 常见问题
281
+
282
+ #### 1. 我需要多少 VRAM? 推理速度如何?
283
+
284
+ 对于 30 秒的音频片段,至少需要 4GB 的 GPU 内存。 对于 4090 GPU,它可以每秒生成大约 7 个语义 token 对应的音频。实时因子 (RTF) 约为 0.3。
285
+
286
+ #### 2. 模型稳定性不够好,存在多个说话者或音频质量差等问题。
287
+
288
+ 这是一个通常发生在自回归模型(例如 bark 和 valle)中的问题,通常很难避免。可以尝试多个样本以找到合适的结果。
289
+
290
+ #### 3. 除了笑声,我们还能控制其他东西吗?我们能控制其他情绪吗?
291
+
292
+ 在当前发布的模型中,可用的 token 级控制单元是 `[laugh]`, `[uv_break]` 和 `[lbreak]`。未来的版本中,我们可能会开源具有更多情绪控制功能的模型。
293
+
294
+ ## 致谢
295
+
296
+ - [bark](https://github.com/suno-ai/bark), [XTTSv2](https://github.com/coqui-ai/TTS) 和 [valle](https://arxiv.org/abs/2301.02111) 通过自回归式系统展示了非凡的 TTS 效果。
297
+ - [fish-speech](https://github.com/fishaudio/fish-speech) 揭示了 GVQ 作为 LLM 建模的音频分词器的能力。
298
+ - [vocos](https://github.com/gemelo-ai/vocos) 被用作预训练声码器。
299
+
300
+ ## 特别鸣谢
301
+
302
+ - [wlu-audio lab](https://audio.westlake.edu.cn/) 对于早期算法实验的支持。
303
+
304
+ ## 贡献者列表
305
+
306
+ [![contributors](https://contrib.rocks/image?repo=2noise/ChatTTS)](https://github.com/2noise/ChatTTS/graphs/contributors)
307
+
308
+ ## 项目浏览量
309
+
310
+ <div align="center">
311
+
312
+ ![counter](https://counter.seku.su/cmoe?name=chattts&theme=mbs)
313
+
314
+ </div>
docs/es/README.md ADDED
@@ -0,0 +1,255 @@
1
+ <div align="center">
2
+
3
+ <a href="https://trendshift.io/repositories/10489" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10489" alt="2noise%2FChatTTS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
4
+
5
+ # ChatTTS
6
+ Un modelo de generación de voz para la conversación diaria.
7
+
8
+ [![Licence](https://img.shields.io/github/license/2noise/ChatTTS?style=for-the-badge)](https://github.com/2noise/ChatTTS/blob/main/LICENSE)
9
+
10
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
11
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
12
+
13
+ [**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | **Español** | [**Français**](../fr/README.md)
14
+
15
+ </div>
16
+
17
+ > [!NOTE]
18
+ > Atención, es posible que esta versión no sea la última. Por favor, consulte la versión en inglés para conocer todo el contenido.
19
+
20
+ ## Introducción
21
+
22
+ ChatTTS es un modelo de texto a voz diseñado específicamente para escenarios conversacionales como LLM assistant.
23
+
24
+ ### Idiomas Soportados
25
+
26
+ - [x] Inglés
27
+ - [x] Chino
28
+ - [ ] Manténganse al tanto...
29
+
30
+ ### Aspectos Destacados
31
+
32
+ > Puede consultar **[este video en Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)** para obtener una descripción detallada.
33
+
34
+ 1. **TTS Conversacional**: ChatTTS está optimizado para tareas conversacionales, logrando una síntesis de voz natural y expresiva. Soporta múltiples hablantes, lo que facilita la generación de diálogos interactivos.
35
+ 2. **Control Fino**: Este modelo puede predecir y controlar características detalladas de la prosodia, incluyendo risas, pausas e interjecciones.
36
+ 3. **Mejor Prosodia**: ChatTTS supera a la mayoría de los modelos TTS de código abierto en cuanto a prosodia. Ofrecemos modelos preentrenados para apoyar estudios y desarrollos adicionales.
37
+
38
+ ### Conjunto de Datos & Modelo
39
+
40
+ - El modelo principal se entrena con más de 100.000 horas de datos de audio en chino e inglés.
41
+ - La versión de código abierto en **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** es un modelo preentrenado con 40.000 horas, sin SFT.
42
+
43
+ ### Hoja de Ruta
44
+
45
+ - [x] Publicar el modelo base de 40k horas y el archivo spk_stats como código abierto
46
+ - [ ] Publicar los códigos de codificador VQ y entrenamiento de Lora como código abierto
47
+ - [ ] Generación de audio en streaming sin refinar el texto
48
+ - [ ] Publicar la versión de 40k horas con control de múltiples emociones como código abierto
49
+ - [ ] ¿ChatTTS.cpp? (Se aceptan PR o un nuevo repositorio)
50
+
51
+ ### Descargo de Responsabilidad
52
+
53
+ > [!Important]
54
+ > Este repositorio es sólo para fines académicos.
55
+
56
+ Este proyecto está destinado a fines educativos y estudios, y no es adecuado para ningún propósito comercial o legal. El autor no garantiza la exactitud, integridad o fiabilidad de la información. La información y los datos utilizados en este repositorio son únicamente para fines académicos y de investigación. Los datos provienen de fuentes públicas, y el autor no reclama ningún derecho de propiedad o copyright sobre ellos.
57
+
58
+ ChatTTS es un potente sistema de conversión de texto a voz. Sin embargo, es crucial utilizar esta tecnología de manera responsable y ética. Para limitar el uso de ChatTTS, hemos añadido una pequeña cantidad de ruido de alta frecuencia durante el proceso de entrenamiento del modelo de 40.000 horas y hemos comprimido la calidad del audio en formato MP3 tanto como sea posible para evitar que actores malintencionados lo usen con fines delictivos. Además, hemos entrenado internamente un modelo de detección y planeamos hacerlo de código abierto en el futuro.
59
+
60
+ ### Contacto
61
+
62
+ > No dudes en enviar issues/PRs de GitHub.
63
+
64
+ #### Consultas Formales
65
+
66
+ Si desea discutir la cooperación sobre modelos y hojas de ruta, envíe un correo electrónico a **[email protected]**.
67
+
68
+ #### Chat en Línea
69
+
70
+ ##### 1. Grupo QQ (Aplicación Social China)
71
+
72
+ - **Grupo 1**, 808364215 (Lleno)
73
+ - **Grupo 2**, 230696694 (Lleno)
74
+ - **Grupo 3**, 933639842
75
+
76
+ ## Instalación (En Proceso)
77
+
78
+ > Se cargará en pypi pronto según https://github.com/2noise/ChatTTS/issues/269.
79
+
80
+ ```bash
81
+ pip install git+https://github.com/2noise/ChatTTS
82
+ ```
83
+
84
+ ## Inicio
85
+ ### Clonar el repositorio
86
+ ```bash
87
+ git clone https://github.com/2noise/ChatTTS
88
+ cd ChatTTS
89
+ ```
90
+
91
+ ### Requerimientos de instalación
92
+ #### 1. Instalar directamente
93
+ ```bash
94
+ pip install --upgrade -r requirements.txt
95
+ ```
96
+
97
+ #### 2. Instalar desde conda
98
+ ```bash
99
+ conda create -n chattts
100
+ conda activate chattts
101
+ pip install -r requirements.txt
102
+ ```
103
+
104
+ ### Inicio Rápido
105
+ #### 1. Iniciar la interfaz de usuario web (WebUI)
106
+ ```bash
107
+ python examples/web/webui.py
108
+ ```
109
+
110
+ #### 2. Inferir por línea de comando
111
+ > Guardará el audio en `./output_audio_n.mp3`
112
+
113
+ ```bash
114
+ python examples/cmd/run.py "Please input your text."
115
+ ```
116
+
117
+ ### Básico
118
+
119
+ ```python
120
+ import ChatTTS
121
+ from IPython.display import Audio
122
+ import torchaudio
123
+ import torch
124
+
125
+ chat = ChatTTS.Chat()
126
+ chat.load(compile=False) # Set to True for better performance
127
+
128
+ texts = ["PUT YOUR TEXT HERE",]
129
+
130
+ wavs = chat.infer(texts)
131
+
132
+ torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
133
+ ```
134
+
135
+ ### Avanzado
136
+
137
+ ```python
138
+ ###################################
139
+ # Sample a speaker from Gaussian.
140
+
141
+ rand_spk = chat.sample_random_speaker()
142
+ print(rand_spk) # save it for later timbre recovery
143
+
144
+ params_infer_code = ChatTTS.Chat.InferCodeParams(
145
+ spk_emb = rand_spk, # add sampled speaker
146
+ temperature = .3, # using custom temperature
147
+ top_P = 0.7, # top P decode
148
+ top_K = 20, # top K decode
149
+ )
150
+
151
+ ###################################
152
+ # For sentence level manual control.
153
+
154
+ # use oral_(0-9), laugh_(0-2), break_(0-7)
155
+ # to generate special token in text to synthesize.
156
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
157
+ prompt='[oral_2][laugh_0][break_6]',
158
+ )
159
+
160
+ wavs = chat.infer(
161
+ texts,
162
+ params_refine_text=params_refine_text,
163
+ params_infer_code=params_infer_code,
164
+ )
165
+
166
+ ###################################
167
+ # For word level manual control.
168
+ text = 'What is [uv_break]your favorite english food?[laugh][lbreak]'
169
+ wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
170
+ torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
171
+ ```
172
+
173
+ <details open>
174
+ <summary><h4>Ejemplo: auto presentación</h4></summary>
175
+
176
+ ```python
177
+ inputs_en = """
178
+ chat T T S is a text to speech model designed for dialogue applications.
179
+ [uv_break]it supports mixed language input [uv_break]and offers multi speaker
180
+ capabilities with precise control over prosodic elements [laugh]like like
181
+ [uv_break]laughter[laugh], [uv_break]pauses, [uv_break]and intonation.
182
+ [uv_break]it delivers natural and expressive speech,[uv_break]so please
183
+ [uv_break] use the project responsibly at your own risk.[uv_break]
184
+ """.replace('\n', '') # English is still experimental.
185
+
186
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
187
+ prompt='[oral_2][laugh_0][break_4]',
188
+ )
189
+
190
+ audio_array_en = chat.infer(inputs_en, params_refine_text=params_refine_text)
191
+ torchaudio.save("output3.wav", torch.from_numpy(audio_array_en[0]), 24000)
192
+ ```
193
+
194
+ <table>
195
+ <tr>
196
+ <td align="center">
197
+
198
+ **altavoz masculino**
199
+
200
+ </td>
201
+ <td align="center">
202
+
203
+ **altavoz femenino**
204
+
205
+ </td>
206
+ </tr>
207
+ <tr>
208
+ <td align="center">
209
+
210
+ [male speaker](https://github.com/2noise/ChatTTS/assets/130631963/e0f51251-db7f-4d39-a0e9-3e095bb65de1)
211
+
212
+ </td>
213
+ <td align="center">
214
+
215
+ [female speaker](https://github.com/2noise/ChatTTS/assets/130631963/f5dcdd01-1091-47c5-8241-c4f6aaaa8bbd)
216
+
217
+ </td>
218
+ </tr>
219
+ </table>
220
+
221
+
222
+ </details>
223
+
224
+ ## Preguntas y Respuestas
225
+
226
+ #### 1. ¿Cuánta memoria gráfica de acceso aleatorio necesito? ¿Qué tal inferir la velocidad?
227
+ Para un clip de audio de 30 segundos, se requieren al menos 4 GB de memoria de GPU. Para la GPU 4090, puede generar audio correspondiente a aproximadamente 7 tokens semánticos por segundo. El Factor en Tiempo Real (RTF) es aproximadamente 0,3.
228
+
229
+ #### 2. La estabilidad del modelo no es lo suficientemente buena y existen problemas como varios altavoces o mala calidad del sonido.
230
+
231
+ Este es un problema común en los modelos autorregresivos (para bark y valle). Generalmente es difícil de evitar. Puede probar varias muestras para encontrar resultados adecuados.
232
+
233
+ #### 3. ¿Podemos controlar algo más que la risa? ¿Podemos controlar otras emociones?
234
+
235
+ En el modelo lanzado actualmente, las únicas unidades de control a nivel de token son `[laugh]`, `[uv_break]` y `[lbreak]`. En una versión futura, es posible que abramos el código fuente del modelo con capacidades adicionales de control de emociones.
236
+
237
+ ## Agradecimientos
238
+ - [bark](https://github.com/suno-ai/bark), [XTTSv2](https://github.com/coqui-ai/TTS) y [valle](https://arxiv.org/abs/2301.02111) demuestran un resultado TTS notable mediante un sistema de estilo autorregresivo.
239
+ - [fish-speech](https://github.com/fishaudio/fish-speech) revela las capacidades de GVQ como tokenizador de audio para el modelado LLM.
240
+ - [vocos](https://github.com/gemelo-ai/vocos) se utiliza como codificador de voz previamente entrenado.
241
+
242
+ ## Agradecimiento Especial
243
+ - [wlu-audio lab](https://audio.westlake.edu.cn/) para experimentos iniciales del algoritmo.
244
+
245
+ ## Recursos Relacionados
246
+ - [Awesome-ChatTTS](https://github.com/libukai/Awesome-ChatTTS)
247
+
248
+ ## Gracias a todos los contribuyentes por sus esfuerzos.
249
+ [![contributors](https://contrib.rocks/image?repo=2noise/ChatTTS)](https://github.com/2noise/ChatTTS/graphs/contributors)
250
+
251
+ <div align="center">
252
+
253
+ ![counter](https://counter.seku.su/cmoe?name=chattts&theme=mbs)
254
+
255
+ </div>
docs/fr/README.md ADDED
@@ -0,0 +1,283 @@
1
+ <div align="center">
2
+
3
+ <a href="https://trendshift.io/repositories/10489" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10489" alt="2noise%2FChatTTS | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
4
+
5
+ # ChatTTS
6
+ Un modèle de parole génératif pour le dialogue quotidien.
7
+
8
+ [![Licence](https://img.shields.io/github/license/2noise/ChatTTS?style=for-the-badge)](https://github.com/2noise/ChatTTS/blob/main/LICENSE)
9
+ [![PyPI](https://img.shields.io/pypi/v/ChatTTS.svg?style=for-the-badge&color=green)](https://pypi.org/project/ChatTTS)
10
+
11
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
12
+ [![Open In Colab](https://img.shields.io/badge/Colab-F9AB00?style=for-the-badge&logo=googlecolab&color=525252)](https://colab.research.google.com/github/2noise/ChatTTS/blob/main/examples/ipynb/colab.ipynb)
13
+ [![Discord](https://img.shields.io/badge/Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/Ud5Jxgx5yD)
14
+
15
+ [**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | [**Русский**](../ru/README.md) | [**Español**](../es/README.md) | **Français**
16
+
17
+ </div>
18
+
19
+ ## Introduction
20
+ > [!Note]
21
+ > Ce dépôt contient l'infrastructure de l'algorithme et quelques exemples simples.
22
+
23
+ > [!Tip]
24
+ > Pour les produits finaux étendus pour les utilisateurs, veuillez consulter le dépôt index [Awesome-ChatTTS](https://github.com/libukai/Awesome-ChatTTS/tree/en) maintenu par la communauté.
25
+
26
+ ChatTTS est un modèle de synthèse vocale conçu spécifiquement pour les scénarios de dialogue tels que les assistants LLM.
27
+
28
+ ### Langues prises en charge
29
+ - [x] Anglais
30
+ - [x] Chinois
31
+ - [ ] À venir...
32
+
33
+ ### Points forts
34
+ > Vous pouvez vous référer à **[cette vidéo sur Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)** pour une description détaillée.
35
+
36
+ 1. **Synthèse vocale conversationnelle**: ChatTTS est optimisé pour les tâches basées sur le dialogue, permettant une synthèse vocale naturelle et expressive. Il prend en charge plusieurs locuteurs, facilitant les conversations interactives.
37
+ 2. **Contrôle granulaire**: Le modèle peut prédire et contrôler des caractéristiques prosodiques fines, y compris le rire, les pauses et les interjections.
38
+ 3. **Meilleure prosodie**: ChatTTS dépasse la plupart des modèles TTS open-source en termes de prosodie. Nous fournissons des modèles pré-entraînés pour soutenir la recherche et le développement.
39
+
40
+ ### Dataset & Modèle
41
+ - Le modèle principal est entraîné avec des données audio en chinois et en anglais de plus de 100 000 heures.
42
+ - La version open-source sur **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** est un modèle pré-entraîné de 40 000 heures sans SFT.
43
+
44
+ ### Roadmap
45
+ - [x] Open-source du modèle de base de 40k heures et du fichier spk_stats.
46
+ - [x] Génération audio en streaming.
47
+ - [ ] Open-source de la version 40k heures avec contrôle multi-émotions.
48
+ - [ ] ChatTTS.cpp (nouveau dépôt dans l'organisation `2noise` est bienvenu)
49
+
50
+ ### Avertissement
51
+ > [!Important]
52
+ > Ce dépôt est à des fins académiques uniquement.
53
+
54
+ Il est destiné à un usage éducatif et de recherche, et ne doit pas être utilisé à des fins commerciales ou légales. Les auteurs ne garantissent pas l'exactitude, l'exhaustivité ou la fiabilité des informations. Les informations et les données utilisées dans ce dépôt sont à des fins académiques et de recherche uniquement. Les données obtenues à partir de sources accessibles au public, et les auteurs ne revendiquent aucun droit de propriété ou de copyright sur les données.
55
+
56
+ ChatTTS est un système de synthèse vocale puissant. Cependant, il est très important d'utiliser cette technologie de manière responsable et éthique. Pour limiter l'utilisation de ChatTTS, nous avons ajouté une petite quantité de bruit haute fréquence pendant l'entraînement du modèle de 40 000 heures et compressé la qualité audio autant que possible en utilisant le format MP3, pour empêcher les acteurs malveillants de l'utiliser potentiellement à des fins criminelles. En même temps, nous avons entraîné en interne un modèle de détection et prévoyons de l'open-source à l'avenir.
57
+
58
+ ### Contact
59
+ > Les issues/PRs sur GitHub sont toujours les bienvenus.
60
+
61
+ #### Demandes formelles
62
+ Pour les demandes formelles concernant le modèle et la feuille de route, veuillez nous contacter à **[email protected]**.
63
+
64
+ #### Discussion en ligne
65
+ ##### 1. Groupe QQ (application sociale chinoise)
66
+ - **Groupe 1**, 808364215 (Complet)
67
+ - **Groupe 2**, 230696694 (Complet)
68
+ - **Groupe 3**, 933639842 (Complet)
69
+ - **Groupe 4**, 608667975
70
+
71
+ ##### 2. Serveur Discord
72
+ Rejoignez en cliquant [ici](https://discord.gg/Ud5Jxgx5yD).
73
+
74
+ ## Pour commencer
75
+ ### Cloner le dépôt
76
+ ```bash
77
+ git clone https://github.com/2noise/ChatTTS
78
+ cd ChatTTS
79
+ ```
80
+
81
+ ### Installer les dépendances
82
+ #### 1. Installation directe
83
+ ```bash
84
+ pip install --upgrade -r requirements.txt
85
+ ```
86
+
87
+ #### 2. Installer depuis conda
88
+ ```bash
89
+ conda create -n chattts
90
+ conda activate chattts
91
+ pip install -r requirements.txt
92
+ ```
93
+
94
+ #### Optionnel : Installer TransformerEngine si vous utilisez un GPU NVIDIA (Linux uniquement)
95
+ > [!Note]
96
+ > Le processus d'installation est très lent.
97
+
98
+ > [!Warning]
99
+ > L'adaptation de TransformerEngine est actuellement en cours de développement et NE PEUT PAS fonctionner correctement pour le moment.
100
+ > Installez-le uniquement à des fins de développement.
101
+
102
+ ```bash
103
+ pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
104
+ ```
105
+
106
+ #### Optionnel : Installer FlashAttention-2 (principalement GPU NVIDIA)
107
+ > [!Note]
108
+ > Voir les appareils pris en charge dans la [documentation Hugging Face](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2).
109
+
110
+ > [!Warning]
111
+ > Actuellement, FlashAttention-2 ralentira la vitesse de génération selon [ce problème](https://github.com/huggingface/transformers/issues/26990).
112
+ > Installez-le uniquement à des fins de développement.
113
+
114
+ ```bash
115
+ pip install flash-attn --no-build-isolation
116
+ ```
117
+
118
+ ### Démarrage rapide
119
+ > Assurez-vous que vous êtes dans le répertoire racine du projet lorsque vous exécutez ces commandes ci-dessous.
120
+
121
+ #### 1. Lancer WebUI
122
+ ```bash
123
+ python examples/web/webui.py
124
+ ```
125
+
126
+ #### 2. Inférence par ligne de commande
127
+ > Cela enregistrera l'audio sous `./output_audio_n.mp3`
128
+
129
+ ```bash
130
+ python examples/cmd/run.py "Votre premier texte." "Votre deuxième texte."
131
+ ```
132
+
133
+ ## Installation
134
+
135
+ 1. Installer la version stable depuis PyPI
136
+ ```bash
137
+ pip install ChatTTS
138
+ ```
139
+
140
+ 2. Installer la dernière version depuis GitHub
141
+ ```bash
142
+ pip install git+https://github.com/2noise/ChatTTS
143
+ ```
144
+
145
+ 3. Installer depuis le répertoire local en mode développement
146
+ ```bash
147
+ pip install -e .
148
+ ```
149
+
150
+ ### Utilisation de base
151
+
152
+ ```python
153
+ import ChatTTS
154
+ import torch
155
+ import torchaudio
156
+
157
+ chat = ChatTTS.Chat()
158
+ chat.load(compile=False) # Définissez sur True pour de meilleures performances
159
+
160
+ texts = ["METTEZ VOTRE PREMIER TEXTE ICI", "METTEZ VOTRE DEUXIÈME TEXTE ICI"]
161
+
162
+ wavs = chat.infer(texts)
163
+
164
+ torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
165
+ ```
166
+
167
+ ### Utilisation avancée
168
+
169
+ ```python
170
+ ###################################
171
+ # Échantillonner un locuteur à partir d'une distribution gaussienne.
172
+
173
+ rand_spk = chat.sample_random_speaker()
174
+ print(rand_spk) # sauvegardez-le pour une récupération ultérieure du timbre
175
+
176
+ params_infer_code = ChatTTS.Chat.InferCodeParams(
177
+ spk_emb = rand_spk, # ajouter le locuteur échantillonné
178
+ temperature = .3, # en utilisant une température personnalisée
179
+ top_P = 0.7, # top P décode
180
+ top_K = 20, # top K décode
181
+ )
182
+
183
+ ###################################
184
+ # Pour le contrôle manuel au niveau des phrases.
185
+
186
+ # utilisez oral_(0-9), laugh_(0-2), break_(0-7)
187
+ # pour générer un token spécial dans le texte à synthétiser.
188
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
189
+ prompt='[oral_2][laugh_0][break_6]',
190
+ )
191
+
192
+ wavs = chat.infer(
193
+ texts,
194
+ params_refine_text=params_refine_text,
195
+ params_infer_code=params_infer_code,
196
+ )
197
+
198
+ ###################################
199
+ # Pour le contrôle manuel au niveau des mots.
200
+
201
+ text = 'Quel est [uv_break]votre plat anglais préféré?[laugh][lbreak]'
202
+ wavs = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
203
+ torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
204
+ ```
205
+
206
+ <details open>
207
+ <summary><h4>Exemple : auto-présentation</h4></summary>
208
+
209
+ ```python
210
+ inputs_en = """
211
+ chat T T S est un modèle de synthèse vocale conçu pour les applications de dialogue.
212
+ [uv_break]il prend en charge les entrées en langues mixtes [uv_break]et offre des capacités multi-locuteurs
213
+ avec un contrôle précis des éléments prosodiques comme
214
+ [uv_break]le rire[uv_break][laugh], [uv_break]les pauses, [uv_break]et l'intonation.
215
+ [uv_break]il délivre une parole naturelle et expressive,[uv_break]donc veuillez
216
+ [uv_break]utiliser le projet de manière responsable à vos risques et périls.[uv_break]
217
+ """.replace('\n', '') # L'anglais est encore expérimental.
218
+
219
+ params_refine_text = ChatTTS.Chat.RefineTextParams(
220
+ prompt='[oral_2][laugh_0][break_4]',
221
+ )
222
+
223
+ audio_array_en = chat.infer(inputs_en, params_refine_text=params_refine_text)
224
+ torchaudio.save("output3.wav", torch.from_numpy(audio_array_en[0]), 24000)
225
+ ```
226
+
227
+ <table>
228
+ <tr>
229
+ <td align="center">
230
+
231
+ **locuteur masculin**
232
+
233
+ </td>
234
+ <td align="center">
235
+
236
+ **locutrice féminine**
237
+
238
+ </td>
239
+ </tr>
240
+ <tr>
241
+ <td align="center">
242
+
243
+ [locuteur masculin](https://github.com/2noise/ChatTTS/assets/130631963/e0f51251-db7f-4d39-a0e9-3e095bb65de1)
244
+
245
+ </td>
246
+ <td align="center">
247
+
248
+ [locutrice féminine](https://github.com/2noise/ChatTTS/assets/130631963/f5dcdd01-1091-47c5-8241-c4f6aaaa8bbd)
249
+
250
+ </td>
251
+ </tr>
252
+ </table>
253
+
254
+
255
+ </details>
256
+
257
+ ## FAQ
258
+
259
+ #### 1. De combien de VRAM ai-je besoin ? Quelle est la vitesse d'inférence ?
260
+ Pour un clip audio de 30 secondes, au moins 4 Go de mémoire GPU sont nécessaires. Pour le GPU 4090, il peut générer de l'audio correspondant à environ 7 tokens sémantiques par seconde. Le Facteur Temps Réel (RTF) est d'environ 0.3.
261
+
262
+ #### 2. La stabilité du modèle n'est pas suffisante, avec des problèmes tels que des locuteurs multiples ou une mauvaise qualité audio.
263
+ C'est un problème qui se produit généralement avec les modèles autoregressifs (pour bark et valle). Il est généralement difficile à éviter. On peut essayer plusieurs échantillons pour trouver un résultat approprié.
264
+
265
+ #### 3. En plus du rire, pouvons-nous contrôler autre chose ? Pouvons-nous contrôler d'autres émotions ?
266
+ Dans le modèle actuellement publié, les seules unités de contrôle au niveau des tokens sont `[laugh]`, `[uv_break]`, et `[lbreak]`. Dans les futures versions, nous pourrions open-source des modèles avec des capacités de contrôle émotionnel supplémentaires.
267
+
268
+ ## Remerciements
269
+ - [bark](https://github.com/suno-ai/bark), [XTTSv2](https://github.com/coqui-ai/TTS) et [valle](https://arxiv.org/abs/2301.02111) démontrent un résultat TTS remarquable par un système de style autoregressif.
270
+ - [fish-speech](https://github.com/fishaudio/fish-speech) révèle la capacité de GVQ en tant que tokenizer audio pour la modélisation LLM.
271
+ - [vocos](https://github.com/gemelo-ai/vocos) qui est utilisé comme vocodeur pré-entraîné.
272
+
273
+ ## Appréciation spéciale
274
+ - [wlu-audio lab](https://audio.westlake.edu.cn/) pour les expériences d'algorithme précoce.
275
+
276
+ ## Merci à tous les contributeurs pour leurs efforts
277
+ [![contributors](https://contrib.rocks/image?repo=2noise/ChatTTS)](https://github.com/2noise/ChatTTS/graphs/contributors)
278
+
279
+ <div align="center">
280
+
281
+ ![counter](https://counter.seku.su/cmoe?name=chattts&theme=mbs)
282
+
283
+ </div>
docs/jp/README.md ADDED
@@ -0,0 +1,134 @@
1
+ # ChatTTS
2
+ > [!NOTE]
3
+ > 以下の内容は最新情報ではない可能性がありますのでご了承ください。全ての内容は英語版に基準することになります。
4
+
5
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
6
+
7
+ [**English**](../../README.md) | [**简体中文**](../cn/README.md) | **日本語** | [**Русский**](../ru/README.md) | [**Español**](../es/README.md) | [**Français**](../fr/README.md)
8
+
9
+ ChatTTSは、LLMアシスタントなどの対話シナリオ用に特別に設計されたテキストから音声へのモデルです。英語と中国語の両方をサポートしています。私たちのモデルは、中国語と英語で構成される100,000時間以上でトレーニングされています。**[HuggingFace](https://huggingface.co/2Noise/ChatTTS)**でオープンソース化されているバージョンは、40,000時間の事前トレーニングモデルで、SFTは行われていません。
10
+
11
+ モデルやロードマップについての正式なお問い合わせは、**[email protected]**までご連絡ください。QQグループ:808364215に参加してディスカッションすることもできます。GitHubでの問題提起も歓迎します。
12
+
13
+ ---
14
+ ## ハイライト
15
+ 1. **会話型TTS**: ChatTTSは対話ベースのタスクに最適化されており、自然で表現豊かな音声合成を実現します。複数の話者をサポートし、対話型の会話を容易にします。
16
+ 2. **細かい制御**: このモデルは、笑い、一時停止、間投詞などの細かい韻律特徴を予測および制御することができます。
17
+ 3. **より良い韻律**: ChatTTSは、韻律の面でほとんどのオープンソースTTSモデルを超えています。さらなる研究と開発をサポートするために、事前トレーニングされたモデルを提供しています。
18
+
19
+ モデルの詳細な説明については、**[Bilibiliのビデオ](https://www.bilibili.com/video/BV1zn4y1o7iV)**を参照してください。
20
+
21
+ ---
22
+
23
+ ## 免責事項
24
+
25
+ このリポジトリは学術目的のみのためです。教育および研究用途にのみ使用され、商業的または法的な目的には使用されません。著者は情報の正確性、完全性、または信頼性を保証しません。このリポジトリで使用される情報およびデータは、学術および研究目的のみのためのものです。データは公開されているソースから取得され、著者はデータに対する所有権または著作権を主張しません。
26
+
27
+ ChatTTSは強力なテキストから音声へのシステムです。しかし、この技術を責任を持って、倫理的に利用することが非常に重要です。ChatTTSの使用を制限するために、40,000時間のモデルのトレーニング中に少量の高周波ノイズを追加し、MP3形式を使用して音質を可能な限り圧縮しました。これは、悪意のあるアクターが潜在的に犯罪目的で使用することを防ぐためです。同時に、私たちは内部的に検出モデルをトレーニングしており、将来的にオープンソース化する予定です。
28
+
29
+ ---
30
+ ## 使用方法
31
+
32
+ <h4>基本的な使用方法</h4>
33
+
34
+ ```python
35
+ import ChatTTS
36
+ from IPython.display import Audio
37
+ import torch
+ import torchaudio
38
+
39
+ chat = ChatTTS.Chat()
40
+ chat.load(compile=False) # より良いパフォーマンスのためにTrueに設定
41
+
42
+ texts = ["ここにテキストを入力してください",]
43
+
44
+ wavs = chat.infer(texts)
45
+
46
+ torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
47
+ ```
48
+
49
+ <h4>高度な使用方法</h4>
50
+
51
+ ```python
52
+ ###################################
53
+ # ガウス分布から話者をサンプリングします。
54
+
55
+ rand_spk = chat.sample_random_speaker()
56
+ print(rand_spk) # save it for later timbre recovery
57
+
58
+ params_infer_code = {
59
+ 'spk_emb': rand_spk, # サンプリングされた話者を追加
60
+ 'temperature': .3, # カスタム温度を使用
61
+ 'top_P': 0.7, # トップPデコード
62
+ 'top_K': 20, # トップKデコード
63
+ }
64
+
65
+ ###################################
66
+ # 文レベルの手動制御のために。
67
+
68
+ # 特別なトークンを生成するためにテキストにoral_(0-9)、laugh_(0-2)、break_(0-7)を使用します。
69
+ params_refine_text = {
70
+ 'prompt': '[oral_2][laugh_0][break_6]'
71
+ }
72
+
73
+ wav = chat.infer(texts, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
74
+
75
+ ###################################
76
+ # 単語レベルの手動制御のために。
77
+ text = 'あなたの好きな英語の食べ物は何ですか?[uv_break][laugh][lbreak]'
78
+ wav = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
79
+ torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
80
+ ```
81
+
82
+ <details open>
83
+ <summary><h4>例:自己紹介</h4></summary>
84
+
85
+ ```python
86
+ inputs_jp = """
87
+ ChatTTSは、対話アプリケーション用に設計されたテキストから音声へのモデルです。
88
+ [uv_break]混合言語入力をサポートし[uv_break]、韻律要素[laugh]の正確な制御を提供します
89
+ [uv_break]笑い[laugh]、[uv_break]一時停止、[uv_break]およびイントネーション。[uv_break]自然で表現豊かな音声を提供します
90
+ [uv_break]したがって、自己責任でプロジェクトを責任を持って使用してください。[uv_break]
91
+ """.replace('\n', '') # 英語はまだ実験的です。
92
+
93
+ params_refine_text = {
94
+ 'prompt': '[oral_2][laugh_0][break_4]'
95
+ }
96
+ audio_array_jp = chat.infer(inputs_jp, params_refine_text=params_refine_text)
97
+ torchaudio.save("output3.wav", torch.from_numpy(audio_array_jp[0]), 24000)
98
+ ```
99
+ [男性話者](https://github.com/2noise/ChatTTS/assets/130631963/e0f51251-db7f-4d39-a0e9-3e095bb65de1)
100
+
101
+ [女性話者](https://github.com/2noise/ChatTTS/assets/130631963/f5dcdd01-1091-47c5-8241-c4f6aaaa8bbd)
102
+ </details>
103
+
104
+ ---
105
+ ## ロードマップ
106
+ - [x] 40k時間のベースモデルとspk_statsファイルをオープンソース化
107
+ - [ ] VQエンコーダーとLoraトレーニングコードをオープンソース化
108
+ - [ ] テキストをリファインせずにストリーミングオーディオ生成*
109
+ - [ ] 複数の感情制御を備えた40k時間バージョンをオープンソース化
110
+ - [ ] ChatTTS.cppもしかしたら?(PRや新しいリポジトリが歓迎されます。)
111
+
112
+ ----
113
+ ## FAQ
114
+
115
+ ##### VRAMはどれくらい必要ですか?推論速度はどうですか?
116
+ 30秒のオーディオクリップには、少なくとも4GBのGPUメモリが必要です。4090 GPUの場合、約7つの意味トークンに対応するオーディオを1秒あたり生成できます。リアルタイムファクター(RTF)は約0.3です。
117
+
118
+ ##### モデルの安定性が十分でなく、複数の話者や音質が悪いという問題があります。
119
+
120
+ これは、自己回帰モデル(barkおよびvalleの場合)で一般的に発生する問題です。一般的に避けるのは難しいです。複数のサンプルを試して、適切な結果を見つけることができます。
121
+
122
+ ##### 笑い以外に何か制御できますか?他の感情を制御できますか?
123
+
124
+ 現在リリースされているモデルでは、トークンレベルの制御ユニットは[laugh]、[uv_break]、および[lbreak]のみです。将来のバージョンでは、追加の感情制御機能を備えたモデルをオープンソース化する可能性があります。
125
+
126
+ ---
127
+ ## 謝辞
128
+ - [bark](https://github.com/suno-ai/bark)、[XTTSv2](https://github.com/coqui-ai/TTS)、および[valle](https://arxiv.org/abs/2301.02111)は、自己回帰型システムによる顕著なTTS結果を示しました。
129
+ - [fish-speech](https://github.com/fishaudio/fish-speech)は、LLMモデリングのためのオーディオトークナイザーとしてのGVQの能力を明らかにしました。
130
+ - 事前トレーニングされたボコーダーとして使用される[vocos](https://github.com/gemelo-ai/vocos)。
131
+
132
+ ---
133
+ ## 特別感謝
134
+ - 初期のアルゴリズム実験をサポートしてくれた[wlu-audio lab](https://audio.westlake.edu.cn/)。
docs/ru/README.md ADDED
@@ -0,0 +1,136 @@
1
+ # ChatTTS
2
+ > [!NOTE]
3
+ > Следующая информация может быть не самой последней, пожалуйста, смотрите английскую версию для актуальных данных.
4
+
5
+ [![Huggingface](https://img.shields.io/badge/🤗%20-Models-yellow.svg?style=for-the-badge)](https://huggingface.co/2Noise/ChatTTS)
6
+
7
+ [**English**](../../README.md) | [**简体中文**](../cn/README.md) | [**日本語**](../jp/README.md) | **Русский** | [**Español**](../es/README.md) | [**Français**](../fr/README.md)
8
+
9
+ ChatTTS - это модель преобразования текста в речь, специально разработанная для диалоговых сценариев, таких как помощник LLM. Она поддерживает как английский, так и китайский языки. Наша модель обучена на более чем 100 000 часах английского и китайского языков. Открытая версия на **[HuggingFace](https://huggingface.co/2Noise/ChatTTS)** - это предварительно обученная модель с 40 000 часами без SFT.
10
+
11
+ Для официальных запросов о модели и плане развития, пожалуйста, свяжитесь с нами по адресу **[email protected]**. Вы можете присоединиться к нашей группе QQ: 808364215 для обсуждения. Добавление вопросов на GitHub также приветствуется.
12
+
13
+ ---
14
+ ## Особенности
15
+ 1. **Диалоговый TTS**: ChatTTS оптимизирован для задач, основанных на диалогах, что позволяет создавать натуральную и выразительную речь. Он поддерживает несколько говорящих, облегчая интерактивные беседы.
16
+ 2. **Тонкий контроль**: Модель может предсказывать и контролировать тонкие просодические особенности, включая смех, паузы и вставные слова.
17
+ 3. **Лучшая просодия**: ChatTTS превосходит большинство открытых моделей TTS с точки зрения просодии. Мы предоставляем предварительно обученные модели для поддержки дальнейших исследований и разработок.
18
+
19
+ Для подробного описания модели вы можете обратиться к **[видео на Bilibili](https://www.bilibili.com/video/BV1zn4y1o7iV)**
20
+
21
+ ---
22
+
23
+ ## Отказ от ответственности
24
+
25
+ Этот репозиторий предназначен только для академических целей. Он предназначен для образовательного и исследовательского использования и не должен использоваться в коммерческих или юридических целях. Авторы не гарантируют точность, полноту или надежность информации. Информация и данные, использованные в этом репозитории, предназначены только для академических и исследовательских целей. Данные получены из общедоступных источников, и авторы не заявляют о каких-либо правах собственности или авторских правах на данные.
26
+
27
+ ChatTTS - мощная система преобразования текста в речь. Однако очень важно использовать эту технологию ответственно и этично. Чтобы ограничить использование ChatTTS, мы добавили небольшое количество высокочастотного шума во время обучения модели на 40 000 часов и сжали качество аудио как можно больше с помощью формата MP3, чтобы предотвратить возможное использование злоумышленниками в преступных целях. В то же время мы внутренне обучили модель обнаружения и планируем открыть ее в будущем.
28
+
29
+ ---
30
+ ## Использование
31
+
32
+ <h4>Базовое использование</h4>
33
+
34
+ ```python
35
+ import ChatTTS
36
+ from IPython.display import Audio
37
+ import torch
+ import torchaudio
38
+
39
+ chat = ChatTTS.Chat()
40
+ chat.load(compile=False) # Установите значение True для лучшей производительности
41
+
42
+ texts = ["ВВЕДИ��Е ВАШ ТЕКСТ ЗДЕСЬ",]
43
+
44
+ wavs = chat.infer(texts)
45
+
46
+ torchaudio.save("output1.wav", torch.from_numpy(wavs[0]), 24000)
47
+ ```
48
+
49
+ <h4>Продвинутое использование</h4>
50
+
51
+ ```python
52
+ ###################################
53
+ # Выборка говорящего из Гауссиана.
54
+
55
+ rand_spk = chat.sample_random_speaker()
56
+ print(rand_spk) # save it for later timbre recovery
57
+
58
+ params_infer_code = {
59
+ 'spk_emb': rand_spk, # добавить выбранного говорящего
60
+ 'temperature': .3, # использовать пользовательскую температуру
61
+ 'top_P': 0.7, # декодирование top P
62
+ 'top_K': 20, # декодирование top K
63
+ }
64
+
65
+ ###################################
66
+ # Для контроля на уровне предложений.
67
+
68
+ # используйте oral_(0-9), laugh_(0-2), break_(0-7)
69
+ # для генерации специального токена в тексте для синтеза.
70
+ params_refine_text = {
71
+ 'prompt': '[oral_2][laugh_0][break_6]'
72
+ }
73
+
74
+ wav = chat.infer(texts, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
75
+
76
+ ###################################
77
+ # Для контроля на уровне слов.
78
+ text = 'Какая ваша любимая английская еда?[uv_break]your favorite english food?[laugh][lbreak]'
79
+ wav = chat.infer(text, skip_refine_text=True, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
80
+ torchaudio.save("output2.wav", torch.from_numpy(wavs[0]), 24000)
81
+ ```
82
+
83
+ <details open>
84
+ <summary><h4>Пример: самопрезентация</h4></summary>
85
+
86
+ ```python
87
+ inputs_ru = """
88
+ ChatTTS - это модель преобразования текста в речь, разработанная для диалоговых приложений.
89
+ [uv_break]Она поддерживает смешанный языковой ввод [uv_break]и предлагает возможности множественных говорящих
90
+ с точным контролем над просодическими элементами [laugh]как [uv_break]смех[laugh], [uv_break]паузы, [uv_break]и интонацию.
91
+ [uv_break]Она обеспечивает натуральную и выразительную речь,[uv_break]поэтому, пожалуйста,
92
+ [uv_break] используйте проект ответственно и на свой страх и риск.[uv_break]
93
+ """.replace('\n', '') # Русский язык все еще находится в экспериментальной стадии.
94
+
95
+ params_refine_text = {
96
+ 'prompt': '[oral_2][laugh_0][break_4]'
97
+ }
98
+ audio_array_ru = chat.infer(inputs_ru, params_refine_text=params_refine_text)
99
+ torchaudio.save("output3.wav", torch.from_numpy(audio_array_ru[0]), 24000)
100
+ ```
101
+ [мужской говорящий](https://github.com/2noise/ChatTTS/assets/130631963/e0f51251-db7f-4d39-a0e9-3e095bb65de1)
102
+
103
+ [женский говорящий](https://github.com/2noise/ChatTTS/assets/130631963/f5dcdd01-1091-47c5-8241-c4f6aaaa8bbd)
104
+ </details>
105
+
106
+ ---
107
+ ## План развития
108
+ - [x] Открыть исходный код базовой модели на 40 тысяч часов и файла spk_stats
109
+ - [ ] Открыть исходный код кодировщика VQ и кода обучения Lora
110
+ - [ ] Потоковая генерация аудио без уточнения текста*
111
+ - [ ] Открыть исходный код версии на 40 тысяч часов с управлением множественными эмоциями
112
+ - [ ] ChatTTS.cpp возможно? (PR или новый репозиторий приветствуются.)
113
+
114
+ ----
115
+ ## Часто задаваемые вопросы
116
+
117
+ ##### Сколько VRAM мне нужно? Как насчет скорости инференса?
118
+ Для 30-секундного аудиоклипа требуется как минимум 4 ГБ памяти GPU. Для GPU 4090, он может генерировать аудио, соответствующее примерно 7 семантическим токенам в секунду. Фактор реального времени (RTF) составляет около 0.3.
119
+
120
+ ##### Стабильность модели кажется недостаточно хорошей, возникают проблемы с множественными говорящими или плохим качеством аудио.
121
+
122
+ Это проблема, которая обычно возникает с авторегрессивными моделями (для bark и valle). Это обычно трудно избежать. Можно попробовать несколько образцов, чтобы найти подходящий результат.
123
+
124
+ ##### Помимо смеха, можем ли мы контролировать что-то еще? Можем ли мы контролировать другие эмоции?
125
+
126
+ В текущей выпущенной модели единственными элементами управления на уровне токенов являются [laugh], [uv_break] и [lbreak]. В будущих версиях мы можем открыть модели с дополнительными возможностями контроля эмоций.
127
+
128
+ ---
129
+ ## Благодарности
130
+ - [bark](https://github.com/suno-ai/bark), [XTTSv2](https://github.com/coqui-ai/TTS) и [valle](https://arxiv.org/abs/2301.02111) демонстрируют замечательный результат TTS с помощью системы авторегрессивного стиля.
131
+ - [fish-speech](https://github.com/fishaudio/fish-speech) показывает возможности GVQ как аудио токенизатора для моделирования LLM.
132
+ - [vocos](https://github.com/gemelo-ai/vocos), который используется в качестве предварительно обученного вокодера.
133
+
134
+ ---
135
+ ## Особая благодарность
136
+ - [wlu-audio lab](https://audio.westlake.edu.cn/) за ранние эксперименты с алгоритмами.
examples/__init__.py ADDED
File without changes
examples/api/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # Generating voice with ChatTTS via API
2
+
3
+ ## Install requirements
4
+
5
+ Install `FastAPI` and `requests`:
6
+
7
+ ```
8
+ pip install -r examples/api/requirements.txt
9
+ ```
10
+
11
+ ## Run API server
12
+
13
+ ```
14
+ fastapi dev examples/api/main.py --host 0.0.0.0 --port 8000
15
+ ```
16
+
17
+ ## Generate audio using requests
18
+
19
+ ```
20
+ python examples/api/client.py
21
+ ```
22
+
23
+ mp3 audio files will be saved to the `output` directory.
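+
+ ## Generate audio using curl (optional)
+
+ If you want to exercise the endpoint without Python, a plain `curl` call works as well. The sketch below is a minimal example under a few assumptions: the server above is listening on `localhost:8000`, and the default values of the nested `params_refine_text` / `params_infer_code` objects are acceptable for your checkpoint; see `client.py` for a fully populated request body.
+
+ ```
+ curl -X POST http://localhost:8000/generate_voice \
+   -H "Content-Type: application/json" \
+   -d '{
+         "text": ["Hello, this is a ChatTTS API test."],
+         "stream": false,
+         "params_refine_text": {"prompt": "[oral_2][laugh_0][break_6]"},
+         "params_infer_code": {"prompt": "[speed_5]", "temperature": 0.3}
+       }' \
+   --output audio_files.zip
+ ```
+
+ The response is a zip archive with one mp3 per input text, the same payload that `client.py` extracts. If the server runs elsewhere, note that `client.py` reads `CHATTTS_SERVICE_HOST` and `CHATTTS_SERVICE_PORT` from the environment rather than hard-coding the address.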
examples/api/client.py ADDED
@@ -0,0 +1,76 @@
1
+ import datetime
2
+ import os
3
+ import zipfile
4
+ from io import BytesIO
5
+
6
+ import requests
7
+
8
+ chattts_service_host = os.environ.get("CHATTTS_SERVICE_HOST", "localhost")
9
+ chattts_service_port = os.environ.get("CHATTTS_SERVICE_PORT", "8000")
10
+
11
+ CHATTTS_URL = f"http://{chattts_service_host}:{chattts_service_port}/generate_voice"
12
+
13
+
14
+ # main infer params
15
+ body = {
16
+ "text": [
17
+ "四川美食确实以辣闻名,但也有不辣的选择。",
18
+ "比如甜水面、赖汤圆、蛋烘糕、叶儿粑等,这些小吃口味温和,甜而不腻,也很受欢迎。",
19
+ ],
20
+ "stream": False,
21
+ "lang": None,
22
+ "skip_refine_text": True,
23
+ "refine_text_only": False,
24
+ "use_decoder": True,
25
+ "audio_seed": 12345678,
26
+ "text_seed": 87654321,
27
+ "do_text_normalization": True,
28
+ "do_homophone_replacement": False,
29
+ }
30
+
31
+ # refine text params
32
+ params_refine_text = {
33
+ "prompt": "",
34
+ "top_P": 0.7,
35
+ "top_K": 20,
36
+ "temperature": 0.7,
37
+ "repetition_penalty": 1,
38
+ "max_new_token": 384,
39
+ "min_new_token": 0,
40
+ "show_tqdm": True,
41
+ "ensure_non_empty": True,
42
+ "stream_batch": 24,
43
+ }
44
+ body["params_refine_text"] = params_refine_text
45
+
46
+ # infer code params
47
+ params_infer_code = {
48
+ "prompt": "[speed_5]",
49
+ "top_P": 0.1,
50
+ "top_K": 20,
51
+ "temperature": 0.3,
52
+ "repetition_penalty": 1.05,
53
+ "max_new_token": 2048,
54
+ "min_new_token": 0,
55
+ "show_tqdm": True,
56
+ "ensure_non_empty": True,
57
+ "stream_batch": True,
58
+ "spk_emb": None,
59
+ }
60
+ body["params_infer_code"] = params_infer_code
61
+
62
+
63
+ try:
64
+ response = requests.post(CHATTTS_URL, json=body)
65
+ response.raise_for_status()
66
+ with zipfile.ZipFile(BytesIO(response.content), "r") as zip_ref:
67
+ # save files for each request in a different folder
68
+ dt = datetime.datetime.now()
69
+ ts = int(dt.timestamp())
70
+ tgt = f"./output/{ts}/"
71
+ os.makedirs(tgt, 0o755)
72
+ zip_ref.extractall(tgt)
73
+ print("Extracted files into", tgt)
74
+
75
+ except requests.exceptions.RequestException as e:
76
+ print(f"Request Error: {e}")
examples/api/main.py ADDED
@@ -0,0 +1,107 @@
1
+ import io
2
+ import os
3
+ import sys
4
+ import zipfile
5
+
6
+ from fastapi import FastAPI
7
+ from fastapi.responses import StreamingResponse
8
+
9
+
10
+ if sys.platform == "darwin":
11
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
12
+
13
+ now_dir = os.getcwd()
14
+ sys.path.append(now_dir)
15
+
16
+ from typing import Optional
17
+
18
+ import ChatTTS
19
+
20
+ from tools.audio import pcm_arr_to_mp3_view
21
+ from tools.logger import get_logger
22
+ import torch
23
+
24
+
25
+ from pydantic import BaseModel
26
+
27
+
28
+ logger = get_logger("Command")
29
+
30
+ app = FastAPI()
31
+
32
+
33
+ @app.on_event("startup")
34
+ async def startup_event():
35
+ global chat
36
+
37
+ chat = ChatTTS.Chat(get_logger("ChatTTS"))
38
+ logger.info("Initializing ChatTTS...")
39
+ if chat.load():
40
+ logger.info("Models loaded successfully.")
41
+ else:
42
+ logger.error("Models load failed.")
43
+ sys.exit(1)
44
+
45
+
46
+ class ChatTTSParams(BaseModel):
47
+ text: list[str]
48
+ stream: bool = False
49
+ lang: Optional[str] = None
50
+ skip_refine_text: bool = False
51
+ refine_text_only: bool = False
52
+ use_decoder: bool = True
53
+ do_text_normalization: bool = True
54
+ do_homophone_replacement: bool = False
55
+ params_refine_text: ChatTTS.Chat.RefineTextParams
56
+ params_infer_code: ChatTTS.Chat.InferCodeParams
57
+
58
+
59
+ @app.post("/generate_voice")
60
+ async def generate_voice(params: ChatTTSParams):
61
+ logger.info("Text input: %s", str(params.text))
62
+
63
+ # audio seed
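+ # Seeding torch before sampling keeps chat.sample_random_speaker() deterministic, so the same manual_seed reproduces the same voice.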
64
+ if params.params_infer_code.manual_seed is not None:
65
+ torch.manual_seed(params.params_infer_code.manual_seed)
66
+ params.params_infer_code.spk_emb = chat.sample_random_speaker()
67
+
68
+ # text seed for text refining
69
+ if params.params_refine_text:
70
+ text = chat.infer(
71
+ text=params.text, skip_refine_text=False, refine_text_only=True
72
+ )
73
+ logger.info(f"Refined text: {text}")
74
+ else:
75
+ # no text refining
76
+ text = params.text
77
+
78
+ logger.info("Use speaker:")
79
+ logger.info(params.params_infer_code.spk_emb)
80
+
81
+ logger.info("Start voice inference.")
82
+ wavs = chat.infer(
83
+ text=text,
84
+ stream=params.stream,
85
+ lang=params.lang,
86
+ skip_refine_text=params.skip_refine_text,
87
+ use_decoder=params.use_decoder,
88
+ do_text_normalization=params.do_text_normalization,
89
+ do_homophone_replacement=params.do_homophone_replacement,
90
+ params_infer_code=params.params_infer_code,
91
+ params_refine_text=params.params_refine_text,
92
+ )
93
+ logger.info("Inference completed.")
94
+
95
+ # zip all of the audio files together
96
+ buf = io.BytesIO()
97
+ with zipfile.ZipFile(
98
+ buf, "a", compression=zipfile.ZIP_DEFLATED, allowZip64=False
99
+ ) as f:
100
+ for idx, wav in enumerate(wavs):
101
+ f.writestr(f"{idx}.mp3", pcm_arr_to_mp3_view(wav))
102
+ logger.info("Audio generation successful.")
103
+ buf.seek(0)
104
+
105
+ response = StreamingResponse(buf, media_type="application/zip")
106
+ response.headers["Content-Disposition"] = "attachment; filename=audio_files.zip"
107
+ return response
examples/api/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ fastapi
2
+ requests
examples/cmd/run.py ADDED
@@ -0,0 +1,151 @@
1
+ import os, sys
2
+
3
+ if sys.platform == "darwin":
4
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
5
+
6
+ now_dir = os.getcwd()
7
+ sys.path.append(now_dir)
8
+
9
+ from typing import Optional, List
10
+ import argparse
11
+
12
+ import numpy as np
13
+
14
+ import ChatTTS
15
+
16
+ from tools.logger import get_logger
17
+ from tools.audio import pcm_arr_to_mp3_view
18
+ from tools.normalizer.en import normalizer_en_nemo_text
19
+ from tools.normalizer.zh import normalizer_zh_tn
20
+
21
+ logger = get_logger("Command")
22
+
23
+
24
+ def save_mp3_file(wav, index):
25
+ data = pcm_arr_to_mp3_view(wav)
26
+ mp3_filename = f"output_audio_{index}.mp3"
27
+ with open(mp3_filename, "wb") as f:
28
+ f.write(data)
29
+ logger.info(f"Audio saved to {mp3_filename}")
30
+
31
+
32
+ def load_normalizer(chat: ChatTTS.Chat):
33
+ # try to load normalizer
34
+ try:
35
+ chat.normalizer.register("en", normalizer_en_nemo_text())
36
+ except ValueError as e:
37
+ logger.error(e)
38
+ except BaseException:
39
+ logger.warning("Package nemo_text_processing not found!")
40
+ logger.warning(
41
+ "Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing",
42
+ )
43
+ try:
44
+ chat.normalizer.register("zh", normalizer_zh_tn())
45
+ except ValueError as e:
46
+ logger.error(e)
47
+ except BaseException:
48
+ logger.warning("Package WeTextProcessing not found!")
49
+ logger.warning(
50
+ "Run: conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing",
51
+ )
52
+
53
+
54
+ def main(
55
+ texts: List[str],
56
+ spk: Optional[str] = None,
57
+ stream: bool = False,
58
+ source: str = "local",
59
+ custom_path: str = "",
60
+ ):
61
+ logger.info("Text input: %s", str(texts))
62
+
63
+ chat = ChatTTS.Chat(get_logger("ChatTTS"))
64
+ logger.info("Initializing ChatTTS...")
65
+ load_normalizer(chat)
66
+
67
+ is_load = False
68
+ if os.path.isdir(custom_path) and source == "custom":
69
+ is_load = chat.load(source="custom", custom_path=custom_path)
70
+ else:
71
+ is_load = chat.load(source=source)
72
+
73
+ if is_load:
74
+ logger.info("Models loaded successfully.")
75
+ else:
76
+ logger.error("Models load failed.")
77
+ sys.exit(1)
78
+
79
+ if spk is None:
80
+ spk = chat.sample_random_speaker()
81
+ logger.info("Use speaker:")
82
+ print(spk)
83
+
84
+ logger.info("Start inference.")
85
+ wavs = chat.infer(
86
+ texts,
87
+ stream,
88
+ params_infer_code=ChatTTS.Chat.InferCodeParams(
89
+ spk_emb=spk,
90
+ ),
91
+ )
92
+ logger.info("Inference completed.")
93
+ # Save each generated wav file to a local file
94
+ if stream:
95
+ wavs_list = []
96
+ for index, wav in enumerate(wavs):
97
+ if stream:
98
+ for i, w in enumerate(wav):
99
+ save_mp3_file(w, (i + 1) * 1000 + index)
100
+ wavs_list.append(wav)
101
+ else:
102
+ save_mp3_file(wav, index)
103
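+ # In stream mode, additionally concatenate the buffered chunks along the sample axis and save one complete clip per input text.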
+ if stream:
104
+ for index, wav in enumerate(np.concatenate(wavs_list, axis=1)):
105
+ save_mp3_file(wav, index)
106
+ logger.info("Audio generation successful.")
107
+
108
+
109
+ if __name__ == "__main__":
110
+ r"""
111
+ python -m examples.cmd.run \
112
+ --source custom --custom_path ../../models/2Noise/ChatTTS 你好喲 ":)"
113
+ """
114
+ logger.info("Starting ChatTTS commandline demo...")
115
+ parser = argparse.ArgumentParser(
116
+ description="ChatTTS Command",
117
+ usage='[--spk xxx] [--stream] [--source ***] [--custom_path XXX] "Your text 1." " Your text 2."',
118
+ )
119
+ parser.add_argument(
120
+ "--spk",
121
+ help="Speaker (empty to sample a random one)",
122
+ type=str,
123
+ default=None,
124
+ )
125
+ parser.add_argument(
126
+ "--stream",
127
+ help="Use stream mode",
128
+ action="store_true",
129
+ )
130
+ parser.add_argument(
131
+ "--source",
132
+ help="source form [ huggingface(hf download), local(ckpt save to asset dir), custom(define) ]",
133
+ type=str,
134
+ default="local",
135
+ )
136
+ parser.add_argument(
137
+ "--custom_path",
138
+ help="custom defined model path(include asset ckpt dir)",
139
+ type=str,
140
+ default="",
141
+ )
142
+ parser.add_argument(
143
+ "texts",
144
+ help="Original text",
145
+ default=["YOUR TEXT HERE"],
146
+ nargs=argparse.REMAINDER,
147
+ )
148
+ args = parser.parse_args()
149
+ logger.info(args)
150
+ main(args.texts, args.spk, args.stream, args.source, args.custom_path)
151
+ logger.info("ChatTTS process finished.")