Running on A10G

lengyue233 committed 9bfe4ad (1 parent: e90b8b5)

Optimize graph

Files changed:
- app.py (+5 -10)
- tools/llama/generate.py (+37 -26)
app.py
CHANGED
@@ -41,6 +41,9 @@ Related code are released under BSD-3-Clause License, and weights are released u

 We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.
 我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.
+
+The model running in this WebUI is Fish Speech V1 Medium SFT 4K.
+在此 WebUI 中运行的模型是 Fish Speech V1 Medium SFT 4K.
 """

 TEXTBOX_PLACEHOLDER = """Put your text here. 在此处输入文本."""
@@ -76,7 +79,6 @@ def inference(
     reference_text,
     max_new_tokens,
     chunk_length,
-    top_k,
     top_p,
     repetition_penalty,
     temperature,
@@ -112,7 +114,6 @@ def inference(
         device=vqgan_model.device,
         max_new_tokens=max_new_tokens,
         text=text,
-        top_k=int(top_k) if top_k > 0 else None,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
         temperature=temperature,
@@ -194,10 +195,6 @@ def build_app():
                     step=8,
                 )

-                top_k = gr.Slider(
-                    label="Top-K", minimum=0, maximum=5, value=0, step=1
-                )
-
                 top_p = gr.Slider(
                     label="Top-P", minimum=0, maximum=1, value=0.7, step=0.01
                 )
@@ -264,7 +261,6 @@ def build_app():
             reference_text,
             max_new_tokens,
             chunk_length,
-            top_k,
             top_p,
             repetition_penalty,
             temperature,
@@ -310,8 +306,8 @@ if __name__ == "__main__":
     args.compile = True
     args.max_gradio_length = 1024
     args.tokenizer = "./checkpoints/fish-speech-1"
-    args.llama_checkpoint_path = "./checkpoints/fish-speech-1/text2semantic-sft-
-    args.llama_config_name = "
+    args.llama_checkpoint_path = "./checkpoints/fish-speech-1/text2semantic-sft-medium-v1-4k.pth"
+    args.llama_config_name = "dual_ar_2_codebook_medium"
     args.vqgan_checkpoint_path = "./checkpoints/fish-speech-1/vq-gan-group-fsq-2x1024.pth"
     args.vqgan_config_name = "vqgan_pretrain"

@@ -343,7 +339,6 @@ if __name__ == "__main__":
         reference_text="",
         max_new_tokens=0,
         chunk_length=0,
-        top_k=0,  # 0 means no limit
         top_p=0.7,
         repetition_penalty=1.5,
         temperature=0.7,
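The Top-K control disappears from four places at once because of how the Gradio wiring works: every component listed in `inputs=` is passed to the callback positionally, so the slider, the `inference` signature, the downstream generate call, and the click handler's input list have to change together. A stripped-down, hypothetical sketch of that pattern (component names and slider ranges are illustrative, not the Space's actual layout):

import gradio as gr

def inference(text, top_p, repetition_penalty, temperature):
    # Stand-in for the real synthesis pipeline; arguments arrive in the
    # same order as the components listed in `inputs=` below.
    return f"top_p={top_p}, repetition_penalty={repetition_penalty}, temperature={temperature}"

with gr.Blocks() as app:
    text = gr.Textbox(label="Input Text")
    top_p = gr.Slider(label="Top-P", minimum=0, maximum=1, value=0.7, step=0.01)
    repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=0, maximum=2, value=1.5, step=0.01)
    temperature = gr.Slider(label="Temperature", minimum=0, maximum=2, value=0.7, step=0.01)
    result = gr.Textbox(label="Result")

    generate = gr.Button("Generate")
    # Dropping a control (such as Top-K) means removing it both here and
    # from the callback signature, or the positional mapping breaks.
    generate.click(
        inference,
        inputs=[text, top_p, repetition_penalty, temperature],
        outputs=result,
    )

# app.launch()  # uncomment to serve the demo locally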
tools/llama/generate.py
CHANGED
@@ -42,11 +42,11 @@ def multinomial_sample_one_no_sync(
 def logits_to_probs(
     logits,
     previous_tokens: Optional[torch.Tensor] = None,
-    temperature:
-
-
-
-
+    temperature: torch.Tensor = 1.0,
+    top_p: torch.Tensor = 1.0,
+    repetition_penalty: torch.Tensor = 1.0,
+) -> torch.Tensor:
+    # Apply repetition penalty
     if previous_tokens is not None:
         previous_tokens = previous_tokens.long()
         score = torch.gather(logits, dim=0, index=previous_tokens)
@@ -55,11 +55,9 @@ def logits_to_probs(
         )
         logits.scatter_(dim=0, index=previous_tokens, src=score)

-    #
+    # Apply top-p sampling
     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-    cum_probs = torch.cumsum(
-        torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1
-    )
+    cum_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_remove = cum_probs > top_p
     sorted_indices_to_remove[0] = False  # keep at least one option
     indices_to_remove = sorted_indices_to_remove.scatter(
@@ -69,11 +67,6 @@ def logits_to_probs(

     logits = logits / max(temperature, 1e-5)

-    # if top_k is not None:
-    #     v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-    #     pivot = v.select(-1, -1).unsqueeze(-1)
-    #     logits = torch.where(logits < pivot, -float("Inf"), logits)
-
     probs = torch.nn.functional.softmax(logits, dim=-1)
     return probs

@@ -449,7 +442,6 @@ def generate_long(
     text: str,
     num_samples: int = 1,
     max_new_tokens: int = 0,
-    top_k: int = None,
     top_p: int = 0.7,
     repetition_penalty: float = 1.5,
     temperature: float = 0.7,
@@ -462,6 +454,10 @@ def generate_long(
     prompt_tokens: Optional[torch.Tensor] = None,
     is_streaming: bool = False,
 ):
+    assert 0 < top_p <= 1, "top_p must be in (0, 1]"
+    assert 0 < repetition_penalty < 2, "repetition_penalty must be in (0, 2)"
+    assert 0 < temperature < 2, "temperature must be in (0, 2)"
+
     model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
     im_end_id = tokenizer.convert_tokens_to_ids("<|im_end|>")

@@ -493,8 +489,18 @@ def generate_long(
     )
     logger.info(f"Encoded text: {text}")

+    # Move temperature, top_p, repetition_penalty to device
+    # This is important so that changing params doesn't trigger recompile
+    temperature = torch.tensor(temperature, device=device, dtype=torch.float)
+    top_p = torch.tensor(top_p, device=device, dtype=torch.float)
+    repetition_penalty = torch.tensor(
+        repetition_penalty, device=device, dtype=torch.float
+    )
+
     for sample_idx in range(num_samples):
-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
         global_encoded = []
         all_codes = []
         seg_idx = 0
@@ -540,7 +546,6 @@ def generate_long(
             im_end_id=im_end_id,
             decode_one_token=decode_one_token,
             temperature=temperature,
-            top_k=top_k,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
         )
@@ -548,7 +553,9 @@ def generate_long(
         if sample_idx == 0 and seg_idx == 0 and compile:
             logger.info(f"Compilation time: {time.perf_counter() - t0:.2f} seconds")

-        torch.cuda.synchronize()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
         t = time.perf_counter() - t0

         tokens_generated = y.size(1) - prompt_length
@@ -559,9 +566,11 @@ def generate_long(
         logger.info(
             f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s"
         )
-        logger.info(
-            f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
-        )
+
+        if torch.cuda.is_available():
+            logger.info(
+                f"GPU Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB"
+            )

         # Put the generated tokens
         # since there is <im_end> and <eos> tokens, we remove last 2 tokens
@@ -654,7 +663,6 @@ def launch_thread_safe_queue(
 )
 @click.option("--num-samples", type=int, default=1)
 @click.option("--max-new-tokens", type=int, default=0)
-@click.option("--top-k", type=int, default=None)
 @click.option("--top-p", type=float, default=0.7)
 @click.option("--repetition-penalty", type=float, default=1.5)
 @click.option("--temperature", type=float, default=0.7)
@@ -678,7 +686,6 @@ def main(
     prompt_tokens: Optional[Path],
     num_samples: int,
     max_new_tokens: int,
-    top_k: int,
     top_p: int,
     repetition_penalty: float,
     temperature: float,
@@ -702,7 +709,10 @@ def main(
     model, decode_one_token = load_model(
         config_name, checkpoint_path, device, precision, max_length, compile=compile
     )
-    torch.cuda.synchronize()
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
     logger.info(f"Time to load model: {time.time() - t0:.02f} seconds")

     prompt_tokens = (
@@ -713,7 +723,9 @@ def main(

     tokenizer = AutoTokenizer.from_pretrained(tokenizer)
     torch.manual_seed(seed)
-    torch.cuda.manual_seed(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)

     generator = generate_long(
         model=model,
@@ -722,7 +734,6 @@ def main(
         text=text,
         num_samples=num_samples,
         max_new_tokens=max_new_tokens,
-        top_k=top_k,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
         temperature=temperature,
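The generate.py hunks above are where the commit title "Optimize graph" comes from: the sampling parameters become 0-dim tensors on the model's device, so changing Top-P, repetition penalty, or temperature between requests no longer alters a Python constant baked into the torch.compile'd decode step. A minimal, self-contained sketch of the effect (assuming a recent PyTorch with torch.compile; `scale_logits` is a hypothetical stand-in for the compiled decoding function, not code from this repository):

import torch

def scale_logits(logits, temperature):
    # Stand-in for a compiled sampling step that consumes a temperature.
    return logits / temperature

compiled = torch.compile(scale_logits)
logits = torch.randn(8)

# Passing Python floats: each new value is typically specialized into the
# graph as a constant, so changing it can trigger a fresh compilation
# (exact behaviour varies by PyTorch version).
for t in (0.7, 0.8, 0.9):
    compiled(logits, t)

# Passing a 0-dim tensor: the value is input data to the graph rather than
# a constant, so the compiled artifact is reused as the value changes.
for t in (0.7, 0.8, 0.9):
    compiled(logits, torch.tensor(t))

Running such a script with TORCH_LOGS=recompiles (a standard PyTorch debugging switch) makes the extra compilations in the float case visible, which matches the comment added in the diff: keeping the parameters as device tensors means adjusting them in the WebUI does not invalidate the compiled graph.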