Spaces:

fishaudio
/

fish-speech-1

Running on A10G

App Files Files Community

Update app.py

by PoTaTo721 - opened Jul 30

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+414

-130

Files changed (1) hide show

app.py +414 -130

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import hydra
 # Download if not exists
 os.makedirs("checkpoints", exist_ok=True)
-snapshot_download(repo_id="fishaudio/fish-speech-1", local_dir="./checkpoints/fish-speech-1")
 print("All checkpoints downloaded")
@@ -30,8 +30,8 @@ os.environ["EINX_FILTER_TRACEBACK"] = "false"
 HEADER_MD = """# Fish Speech
-## The demo in this space is version 1.0, Please check [Fish Audio](https://fish.audio) for the best model.
-## 该 Demo 为 Fish Speech 1.0 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
 A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
 由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
@@ -39,14 +39,14 @@ A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https
 You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).
 你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.
-Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.
-相关代码使用 BSD-3-Clause 许可证发布，权重使用 CC BY-NC-SA 4.0 许可证发布.
 We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
 我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.
-The model running in this WebUI is Fish Speech V1 Medium SFT 4K.
-在此 WebUI 中运行的模型是 Fish Speech V1 Medium SFT 4K.
 """
 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
@@ -85,36 +85,27 @@ def inference(
     top_p,
     repetition_penalty,
     temperature,
-    speaker,
 ):
     if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
-        return None, f"Text is too long, please keep it under {args.max_gradio_length} characters."
-    # Parse reference audio aka prompt
-    prompt_tokens = None
-    if enable_reference_audio and reference_audio is not None:
-        # reference_audio_sr, reference_audio_content = reference_audio
-        reference_audio_content, _ = librosa.load(
-            reference_audio, sr=vqgan_model.sampling_rate, mono=True
-        )
-        audios = torch.from_numpy(reference_audio_content).to(vqgan_model.device)[
-            None, None, :
-        ]
-        logger.info(
-            f"Loaded audio with {audios.shape[2] / vqgan_model.sampling_rate:.2f} seconds"
         )
-        # VQ Encoder
-        audio_lengths = torch.tensor(
-            [audios.shape[2]], device=vqgan_model.device, dtype=torch.long
-        )
-        prompt_tokens = vqgan_model.encode(audios, audio_lengths)[0][0]
     # LLAMA Inference
     request = dict(
-        tokenizer=llama_tokenizer,
-        device=vqgan_model.device,
         max_new_tokens=max_new_tokens,
         text=text,
         top_p=top_p,
@@ -123,43 +114,246 @@ def inference(
         compile=args.compile,
         iterative_prompt=chunk_length > 0,
         chunk_length=chunk_length,
-        max_length=args.max_length,
-        speaker=speaker if speaker else None,
         prompt_tokens=prompt_tokens if enable_reference_audio else None,
         prompt_text=reference_text if enable_reference_audio else None,
     )
-    payload = dict(
-        response_queue=queue.Queue(),
-        request=request,
     )
-    llama_queue.put(payload)
-    codes = []
     while True:
-        result = payload["response_queue"].get()
-        if result == "next":
-            # TODO: handle next sentence
-            continue
-        if result == "done":
-            if payload["success"] is False:
-                return None, build_html_error_message(payload["response"])
             break
-        codes.append(result)
-    codes = torch.cat(codes, dim=1)
-    # VQGAN Inference
-    feature_lengths = torch.tensor([codes.shape[1]], device=vqgan_model.device)
-    fake_audios = vqgan_model.decode(
-        indices=codes[None], feature_lengths=feature_lengths, return_audios=True
-    )[0, 0]
-    fake_audios = fake_audios.float().cpu().numpy()
-    return (vqgan_model.sampling_rate, fake_audios), None
 def build_app():
@@ -170,95 +364,182 @@ def build_app():
         app.load(
             None,
             None,
-            js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', 'light');window.location.search = params.toString();}}",
         )
         # Inference
         with gr.Row():
             with gr.Column(scale=3):
                 text = gr.Textbox(
-                    label="Input Text / 输入文本", placeholder=TEXTBOX_PLACEHOLDER, lines=15
                 )
                 with gr.Row():
-                    with gr.Tab(label="Advanced Config / 高级参数"):
                         chunk_length = gr.Slider(
-                            label="Iterative Prompt Length, 0 means off / 迭代提示长度，0 表示关闭",
                             minimum=0,
-                            maximum=100,
-                            value=30,
                             step=8,
                         )
                         max_new_tokens = gr.Slider(
-                            label="Maximum tokens per batch, 0 means no limit / 每批最大令牌数，0 表示无限制",
-                            minimum=128,
-                            maximum=512,
-                            value=512,  # 0 means no limit
                             step=8,
                         )
                         top_p = gr.Slider(
-                            label="Top-P", minimum=0, maximum=1, value=0.7, step=0.01
                         )
                         repetition_penalty = gr.Slider(
-                            label="Repetition Penalty",
-                            minimum=0,
-                            maximum=2,
-                            value=1.5,
                             step=0.01,
                         )
                         temperature = gr.Slider(
                             label="Temperature",
-                            minimum=0,
-                            maximum=2,
                             value=0.7,
                             step=0.01,
                         )
-                        speaker = gr.Textbox(
-                            label="Speaker / 说话人",
-                            placeholder="Type name of the speaker / 输入说话人的名称",
-                            lines=1,
-                        )
-                    with gr.Tab(label="Reference Audio / 参考音频"):
                         gr.Markdown(
-                            "5 to 10 seconds of reference audio, useful for specifying speaker. \n5 到 10 秒的参考音频，适用于指定音色。"
                         )
                         enable_reference_audio = gr.Checkbox(
-                            label="Enable Reference Audio / 启用参考音频",
                         )
                         reference_audio = gr.Audio(
-                            label="Reference Audio / 参考音频",
                             type="filepath",
                         )
-                        reference_text = gr.Textbox(
-                            label="Reference Text / 参考文本",
-                            placeholder="参考文本",
-                            lines=1,
                         )
             with gr.Column(scale=3):
-                with gr.Row():
-                    error = gr.HTML(label="Error Message / 错误信息")
-                with gr.Row():
-                    audio = gr.Audio(label="Generated Audio / 音频", type="numpy")
                 with gr.Row():
                     with gr.Column(scale=3):
                         generate = gr.Button(
-                            value="\U0001F3A7 Generate / 合成", variant="primary"
                         )
         # # Submit
         generate.click(
-            inference,
             [
-                text,
                 enable_reference_audio,
                 reference_audio,
                 reference_text,
@@ -267,12 +548,29 @@ def build_app():
                 top_p,
                 repetition_penalty,
                 temperature,
-                speaker,
             ],
-            [audio, error],
             concurrency_limit=1,
         )
     return app
@@ -281,74 +579,60 @@ def parse_args():
     parser.add_argument(
         "--llama-checkpoint-path",
         type=Path,
-        default="checkpoints/text2semantic-sft-large-v1-4k.pth",
     )
     parser.add_argument(
-        "--llama-config-name", type=str, default="dual_ar_2_codebook_large"
-    )
-    parser.add_argument(
-        "--vqgan-checkpoint-path",
         type=Path,
-        default="checkpoints/vq-gan-group-fsq-2x1024.pth",
     )
-    parser.add_argument("--vqgan-config-name", type=str, default="vqgan_pretrain")
-    parser.add_argument("--tokenizer", type=str, default="fishaudio/fish-speech-1")
     parser.add_argument("--device", type=str, default="cuda")
     parser.add_argument("--half", action="store_true")
-    parser.add_argument("--max-length", type=int, default=2048)
     parser.add_argument("--compile", action="store_true")
     parser.add_argument("--max-gradio-length", type=int, default=0)
     return parser.parse_args()
 if __name__ == "__main__":
     args = parse_args()
     args.precision = torch.half if args.half else torch.bfloat16
-    args.compile = True
-    args.max_gradio_length = 1024
-    args.tokenizer = "./checkpoints/fish-speech-1"
-    args.llama_checkpoint_path = "./checkpoints/fish-speech-1/text2semantic-sft-medium-v1-4k.pth"
-    args.llama_config_name = "dual_ar_2_codebook_medium"
-    args.vqgan_checkpoint_path = "./checkpoints/fish-speech-1/vq-gan-group-fsq-2x1024.pth"
-    args.vqgan_config_name = "vqgan_pretrain"
     logger.info("Loading Llama model...")
     llama_queue = launch_thread_safe_queue(
-        config_name=args.llama_config_name,
         checkpoint_path=args.llama_checkpoint_path,
         device=args.device,
         precision=args.precision,
-        max_length=args.max_length,
         compile=args.compile,
     )
-    llama_tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
     logger.info("Llama model loaded, loading VQ-GAN model...")
-    vqgan_model = load_vqgan_model(
-        config_name=args.vqgan_config_name,
-        checkpoint_path=args.vqgan_checkpoint_path,
         device=args.device,
     )
-    logger.info("VQ-GAN model loaded, warming up...")
     # Dry run to check if the model is loaded correctly and avoid the first-time latency
-    inference(
-        text="Hello, world!",
-        enable_reference_audio=False,
-        reference_audio=None,
-        reference_text="",
-        max_new_tokens=0,
-        chunk_length=0,
-        top_p=0.7,
-        repetition_penalty=1.5,
-        temperature=0.7,
-        speaker=None,
     )
     logger.info("Warming up done, launching the web UI...")
     app = build_app()
-    app.launch(show_api=False)

 # Download if not exists
+# The download directory must match the defaults of --llama-checkpoint-path
+# and --decoder-checkpoint-path ("checkpoints/fish-speech-1.2-sft"),
+# otherwise the models cannot be found when no CLI overrides are given.
 os.makedirs("checkpoints", exist_ok=True)
+snapshot_download(repo_id="fishaudio/fish-speech-1.2-sft", local_dir="./checkpoints/fish-speech-1.2-sft")
 print("All checkpoints downloaded")
 HEADER_MD = """# Fish Speech
+## The demo in this space is version 1.2, Please check [Fish Audio](https://fish.audio) for the best model.
+## 该 Demo 为 Fish Speech 1.2 版本, 请在 [Fish Audio](https://fish.audio) 体验最新 DEMO.
 A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).
 由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.
 You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).
 你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.
+Related code and weights are released under CC BY-NC-SA 4.0 License.
+相关代码，权重使用 CC BY-NC-SA 4.0 许可证发布.
 We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
 我们不对模型的任何滥用负责，请在使用之前考虑您当地的法律法规.
+The model running in this WebUI is Fish Speech V1.2 Medium SFT
+在此 WebUI 中运行的模型是 Fish Speech V1.2 Medium SFT
 """
 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
     top_p,
     repetition_penalty,
     temperature,
+    streaming=False,
 ):
     if args.max_gradio_length > 0 and len(text) > args.max_gradio_length:
+        return (
+            None,
+            None,
+            i18n("Text is too long, please keep it under {} characters.").format(
+                args.max_gradio_length
+            ),
         )
+    # Parse reference audio aka prompt
+    prompt_tokens = encode_reference(
+        decoder_model=decoder_model,
+        reference_audio=reference_audio,
+        enable_reference_audio=enable_reference_audio,
+    )
     # LLAMA Inference
     request = dict(
+        device=decoder_model.device,
         max_new_tokens=max_new_tokens,
         text=text,
         top_p=top_p,
         compile=args.compile,
         iterative_prompt=chunk_length > 0,
         chunk_length=chunk_length,
+        max_length=2048,
         prompt_tokens=prompt_tokens if enable_reference_audio else None,
         prompt_text=reference_text if enable_reference_audio else None,
     )
+    response_queue = queue.Queue()
+    llama_queue.put(
+        GenerateRequest(
+            request=request,
+            response_queue=response_queue,
+        )
     )
+    if streaming:
+        yield wav_chunk_header(), None, None
+    segments = []
     while True:
+        result: WrappedGenerateResponse = response_queue.get()
+        if result.status == "error":
+            yield None, None, build_html_error_message(result.response)
             break
+        result: GenerateResponse = result.response
+        if result.action == "next":
+            break
+        with torch.autocast(
+            device_type=(
+                "cpu"
+                if decoder_model.device.type == "mps"
+                else decoder_model.device.type
+            ),
+            dtype=args.precision,
+        ):
+            fake_audios = decode_vq_tokens(
+                decoder_model=decoder_model,
+                codes=result.codes,
+            )
+        fake_audios = fake_audios.float().cpu().numpy()
+        segments.append(fake_audios)
+        if streaming:
+            yield (fake_audios * 32768).astype(np.int16).tobytes(), None, None
+    if len(segments) == 0:
+        return (
+            None,
+            None,
+            build_html_error_message(
+                i18n("No audio generated, please check the input text.")
+            ),
+        )
+    # No matter streaming or not, we need to return the final audio
+    audio = np.concatenate(segments, axis=0)
+    yield None, (decoder_model.spec_transform.sample_rate, audio), None
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        gc.collect()
+def inference_with_auto_rerank(
+    text,
+    enable_reference_audio,
+    reference_audio,
+    reference_text,
+    max_new_tokens,
+    chunk_length,
+    top_p,
+    repetition_penalty,
+    temperature,
+    use_auto_rerank,
+    streaming=False,
+):
+    """Run non-streaming inference, optionally retrying and keeping the best take.
+
+    Without auto-rerank a single attempt is made and returned as-is. With
+    auto-rerank up to two attempts are made; each one is transcribed with
+    ``batch_asr`` and scored with ``calculate_wer`` against ``text``. A take is
+    accepted immediately when its WER is <= 0.3 and the ASR result's
+    ``"huge_gap"`` flag is falsy; otherwise the take with the lowest WER wins.
+
+    ``streaming`` is accepted for signature compatibility but ignored:
+    ``inference`` is always called with ``streaming=False``.
+
+    Returns:
+        A ``(None, (sample_rate, audio), None)`` triple on success, or
+        ``(None, None, message)`` when inference produced no audio.
+    """
+    max_attempts = 2 if use_auto_rerank else 1
+    best_wer = float("inf")
+    best_result = None
+    for _ in range(max_attempts):
+        audio_generator = inference(
+            text,
+            enable_reference_audio,
+            reference_audio,
+            reference_text,
+            max_new_tokens,
+            chunk_length,
+            top_p,
+            repetition_penalty,
+            temperature,
+            streaming=False,
+        )
+        # Drain the generator; only the last yielded item is the final result.
+        final = None
+        try:
+            while True:
+                final = next(audio_generator)
+        except StopIteration as stop:
+            # inference() reports some failures with `return (...)`. Inside a
+            # generator that value is only visible as StopIteration.value, so
+            # fall back to it when nothing was yielded at all.
+            if final is None:
+                final = stop.value
+        if final is None:
+            return None, None, None
+        _, audio_data, message = final
+        # Check for a missing audio payload BEFORE unpacking it; an error
+        # triple carries None in the audio position.
+        if audio_data is None:
+            return None, None, message
+        sample_rate, audio = audio_data
+        if not use_auto_rerank:
+            return None, (sample_rate, audio), None
+        asr_result = batch_asr(asr_model, [audio], sample_rate)[0]
+        wer = calculate_wer(text, asr_result["text"])
+        if wer <= 0.3 and not asr_result["huge_gap"]:
+            return None, (sample_rate, audio), None
+        # Always keep the first take so the fallback below never returns an
+        # empty (None, None) audio tuple.
+        if best_result is None or wer < best_wer:
+            best_wer = wer
+            best_result = (sample_rate, audio)
+    return None, best_result, None
+# Streaming entry point: `inference` with streaming=True pre-bound.
+inference_stream = partial(inference, streaming=True)
+# Number of audio/error output slots created by build_app(); also the upper
+# bound of the "Batch infer nums" slider.
+n_audios = 4
+# Output components, filled in by build_app() (one entry per slot) and used
+# as the output targets of the generate buttons.
+global_audio_list = []
+global_error_list = []
+def inference_wrapper(
+    text,
+    enable_reference_audio,
+    reference_audio,
+    reference_text,
+    max_new_tokens,
+    chunk_length,
+    top_p,
+    repetition_penalty,
+    temperature,
+    batch_infer_num,
+    if_load_asr_model,
+):
+    """Generate ``batch_infer_num`` takes and map them onto the output slots.
+
+    Each take comes from ``inference_with_auto_rerank`` (with
+    ``if_load_asr_model`` passed as its auto-rerank switch). The first
+    ``batch_infer_num`` slots are made visible and filled with the audio or
+    the error message; the remaining slots up to ``n_audios`` are hidden.
+
+    Returns:
+        A tuple starting with ``None`` (the streaming audio output is left
+        empty), followed by all audio updates and then all error updates.
+    """
+    audio_updates = []
+    error_updates = []
+    # Slots that receive a freshly generated take.
+    for _run in range(batch_infer_num):
+        _stream, generated, failure = inference_with_auto_rerank(
+            text,
+            enable_reference_audio,
+            reference_audio,
+            reference_text,
+            max_new_tokens,
+            chunk_length,
+            top_p,
+            repetition_penalty,
+            temperature,
+            if_load_asr_model,
+        )
+        audio_updates.append(gr.Audio(value=generated or None, visible=True))
+        error_updates.append(gr.HTML(value=failure or None, visible=True))
+    # Unused slots are cleared and hidden.
+    for _slot in range(batch_infer_num, n_audios):
+        audio_updates.append(gr.Audio(value=None, visible=False))
+        error_updates.append(gr.HTML(value=None, visible=False))
+    return (None, *audio_updates, *error_updates)
+def wav_chunk_header(sample_rate=44100, bit_depth=16, channels=1):
+    """Return the bytes of a WAV header for a PCM stream with no frames.
+
+    The header is produced by configuring a ``wave`` writer on an in-memory
+    buffer and closing it without writing any audio data.
+    """
+    with io.BytesIO() as sink:
+        # Closing the writer emits the header even though no frames were written.
+        with wave.open(sink, "wb") as writer:
+            writer.setframerate(sample_rate)
+            writer.setsampwidth(bit_depth // 8)
+            writer.setnchannels(channels)
+        return sink.getvalue()
+def normalize_text(user_input, use_normalization):
+    """Return ``user_input`` normalized via ``ChnNormedText`` when requested.
+
+    When ``use_normalization`` is falsy the input is returned unchanged.
+    """
+    if not use_normalization:
+        return user_input
+    return ChnNormedText(raw_text=user_input).normalize()
+# Lazily loaded ASR model; None while unloaded.
+asr_model = None
+def change_if_load_asr_model(if_load):
+    """Load or unload the global ASR model and relabel the toggle checkbox.
+
+    Args:
+        if_load: Truthy to make sure the model is loaded, falsy to release it.
+
+    Returns:
+        A ``gr.Checkbox`` update whose label describes the next action.
+    """
+    global asr_model
+    if if_load:
+        gr.Warning("Loading faster whisper model...")
+        if asr_model is None:
+            asr_model = load_model()
+        return gr.Checkbox(label="Unload faster whisper model", value=if_load)
+    # Any falsy value (not only the literal False) means "unload", so the
+    # handler always returns a checkbox update.
+    gr.Warning("Unloading faster whisper model...")
+    asr_model = None
+    # Collect garbage first so the dropped model's memory is actually free
+    # by the time the CUDA caching allocator is asked to release it.
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    return gr.Checkbox(label="Load faster whisper model", value=bool(if_load))
+def change_if_auto_label(if_load, if_auto_label, enable_ref, ref_audio, ref_text):
+    """Fill the reference text by transcribing the reference audio.
+
+    Transcription only happens when auto-labeling is on, reference audio is
+    enabled and present, the reference text is blank, and the ASR model is
+    loaded. A warning is shown only when auto-labeling was requested but the
+    model is not loaded; switching auto-labeling off no longer warns.
+
+    Returns:
+        A ``gr.Textbox`` update carrying the (possibly transcribed) text.
+    """
+    # Treat a missing textbox value as empty text instead of crashing on .strip().
+    ref_text = ref_text or ""
+    if not if_auto_label:
+        return gr.Textbox(value=ref_text)
+    if not (if_load and asr_model is not None):
+        gr.Warning("Whisper model not loaded!")
+        return gr.Textbox(value=ref_text)
+    if enable_ref and ref_audio is not None and ref_text.strip() == "":
+        data, sample_rate = librosa.load(ref_audio)
+        res = batch_asr(asr_model, [data], sample_rate)[0]
+        ref_text = res["text"]
+    return gr.Textbox(value=ref_text)
 def build_app():
         app.load(
             None,
             None,
+            js="() => {const params = new URLSearchParams(window.location.search);if (!params.has('__theme')) {params.set('__theme', '%s');window.location.search = params.toString();}}"
+            % args.theme,
         )
         # Inference
         with gr.Row():
             with gr.Column(scale=3):
                 text = gr.Textbox(
+                    label=i18n("Input Text"), placeholder=TEXTBOX_PLACEHOLDER, lines=10
+                )
+                refined_text = gr.Textbox(
+                    label=i18n("Realtime Transform Text"),
+                    placeholder=i18n(
+                        "Normalization Result Preview (Currently Only Chinese)"
+                    ),
+                    lines=5,
+                    interactive=False,
                 )
                 with gr.Row():
+                    if_refine_text = gr.Checkbox(
+                        label=i18n("Text Normalization"),
+                        value=True,
+                        scale=1,
+                    )
+                    if_load_asr_model = gr.Checkbox(
+                        label=i18n("Load / Unload ASR model for auto-reranking"),
+                        value=False,
+                        scale=3,
+                    )
+                with gr.Row():
+                    with gr.Tab(label=i18n("Advanced Config")):
                         chunk_length = gr.Slider(
+                            label=i18n("Iterative Prompt Length, 0 means off"),
                             minimum=0,
+                            maximum=500,
+                            value=100,
                             step=8,
                         )
                         max_new_tokens = gr.Slider(
+                            label=i18n("Maximum tokens per batch, 0 means no limit"),
+                            minimum=0,
+                            maximum=2048,
+                            value=1024,  # 0 means no limit
                             step=8,
                         )
                         top_p = gr.Slider(
+                            label="Top-P",
+                            minimum=0.6,
+                            maximum=0.9,
+                            value=0.7,
+                            step=0.01,
                         )
                         repetition_penalty = gr.Slider(
+                            label=i18n("Repetition Penalty"),
+                            minimum=1,
+                            maximum=1.5,
+                            value=1.2,
                             step=0.01,
                         )
                         temperature = gr.Slider(
                             label="Temperature",
+                            minimum=0.6,
+                            maximum=0.9,
                             value=0.7,
                             step=0.01,
                         )
+                    with gr.Tab(label=i18n("Reference Audio")):
                         gr.Markdown(
+                            i18n(
+                                "5 to 10 seconds of reference audio, useful for specifying speaker."
+                            )
                         )
                         enable_reference_audio = gr.Checkbox(
+                            label=i18n("Enable Reference Audio"),
                         )
                         reference_audio = gr.Audio(
+                            label=i18n("Reference Audio"),
                             type="filepath",
                         )
+                        with gr.Row():
+                            if_auto_label = gr.Checkbox(
+                                label=i18n("Auto Labeling"),
+                                min_width=100,
+                                scale=0,
+                                value=False,
+                            )
+                            reference_text = gr.Textbox(
+                                label=i18n("Reference Text"),
+                                lines=1,
+                                placeholder="在一无所知中，梦里的一天结束了，一个新的「轮回」便会开始。",
+                                value="",
+                            )
+                    with gr.Tab(label=i18n("Batch Inference")):
+                        batch_infer_num = gr.Slider(
+                            label="Batch infer nums",
+                            minimum=1,
+                            maximum=n_audios,
+                            step=1,
+                            value=1,
                         )
             with gr.Column(scale=3):
+                for _ in range(n_audios):
+                    with gr.Row():
+                        error = gr.HTML(
+                            label=i18n("Error Message"),
+                            visible=True if _ == 0 else False,
+                        )
+                        global_error_list.append(error)
+                    with gr.Row():
+                        audio = gr.Audio(
+                            label=i18n("Generated Audio"),
+                            type="numpy",
+                            interactive=False,
+                            visible=True if _ == 0 else False,
+                        )
+                        global_audio_list.append(audio)
+                with gr.Row():
+                    stream_audio = gr.Audio(
+                        label=i18n("Streaming Audio"),
+                        streaming=True,
+                        autoplay=True,
+                        interactive=False,
+                        show_download_button=True,
+                    )
                 with gr.Row():
                     with gr.Column(scale=3):
                         generate = gr.Button(
+                            value="\U0001F3A7 " + i18n("Generate"), variant="primary"
+                        )
+                        generate_stream = gr.Button(
+                            value="\U0001F3A7 " + i18n("Streaming Generate"),
+                            variant="primary",
                         )
+        text.input(
+            fn=normalize_text, inputs=[text, if_refine_text], outputs=[refined_text]
+        )
+        if_load_asr_model.change(
+            fn=change_if_load_asr_model,
+            inputs=[if_load_asr_model],
+            outputs=[if_load_asr_model],
+        )
+        if_auto_label.change(
+            fn=lambda: gr.Textbox(value=""),
+            inputs=[],
+            outputs=[reference_text],
+        ).then(
+            fn=change_if_auto_label,
+            inputs=[
+                if_load_asr_model,
+                if_auto_label,
+                enable_reference_audio,
+                reference_audio,
+                reference_text,
+            ],
+            outputs=[reference_text],
+        )
         # # Submit
         generate.click(
+            inference_wrapper,
             [
+                refined_text,
                 enable_reference_audio,
                 reference_audio,
                 reference_text,
                 top_p,
                 repetition_penalty,
                 temperature,
+                batch_infer_num,
+                if_load_asr_model,
             ],
+            [stream_audio, *global_audio_list, *global_error_list],
             concurrency_limit=1,
         )
+        generate_stream.click(
+            inference_stream,
+            [
+                refined_text,
+                enable_reference_audio,
+                reference_audio,
+                reference_text,
+                max_new_tokens,
+                chunk_length,
+                top_p,
+                repetition_penalty,
+                temperature,
+            ],
+            [stream_audio, global_audio_list[0], global_error_list[0]],
+            concurrency_limit=10,
+        )
     return app
     parser.add_argument(
         "--llama-checkpoint-path",
         type=Path,
+        default="checkpoints/fish-speech-1.2-sft",
     )
     parser.add_argument(
+        "--decoder-checkpoint-path",
         type=Path,
+        default="checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth",
     )
+    parser.add_argument("--decoder-config-name", type=str, default="firefly_gan_vq")
     parser.add_argument("--device", type=str, default="cuda")
     parser.add_argument("--half", action="store_true")
     parser.add_argument("--compile", action="store_true")
     parser.add_argument("--max-gradio-length", type=int, default=0)
+    parser.add_argument("--theme", type=str, default="light")
     return parser.parse_args()
+# Script entry point: load both models, warm them up, then start the web UI.
 if __name__ == "__main__":
     args = parse_args()
+    # --half selects float16; otherwise bfloat16 is used.
     args.precision = torch.half if args.half else torch.bfloat16
     logger.info("Loading Llama model...")
+    # Requests are submitted to this queue by inference().
     llama_queue = launch_thread_safe_queue(
         checkpoint_path=args.llama_checkpoint_path,
         device=args.device,
         precision=args.precision,
         compile=args.compile,
     )
     logger.info("Llama model loaded, loading VQ-GAN model...")
+    decoder_model = load_decoder_model(
+        config_name=args.decoder_config_name,
+        checkpoint_path=args.decoder_checkpoint_path,
         device=args.device,
     )
+    logger.info("Decoder model loaded, warming up...")
     # Dry run to check if the model is loaded correctly and avoid the first-time latency
+    # inference() is a generator, so it must be drained with list() to actually run.
+    list(
+        inference(
+            text="Hello, world!",
+            enable_reference_audio=False,
+            reference_audio=None,
+            reference_text="",
+            max_new_tokens=0,
+            chunk_length=100,
+            top_p=0.7,
+            repetition_penalty=1.2,
+            temperature=0.7,
+        )
     )
     logger.info("Warming up done, launching the web UI...")
     app = build_app()
+    app.launch(show_api=True)