lengyue233 committed
Commit 662d788
1 Parent(s): 2f06fba

Enable compile on A10G

Files changed (2):
  1. app.py +2 -2
  2. tools/llama/generate.py +37 -23
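For context on the commit title: the app.py change below flips `args.compile = True`, which is the switch that lets `load_llama_model` wrap the per-token decode step with `torch.compile`. A minimal sketch of that pattern, assuming a `decode_one_token`-style callable; the mode and the stand-in model call are illustrative, not necessarily what this repo does:

```python
import torch

def decode_one_token(model, x, input_pos):
    # Stand-in for the real single-step decoder returned by load_llama_model.
    logits = model(x, input_pos)
    return logits.argmax(dim=-1)

# Hypothetical equivalent of `args.compile = True`: compile the hot decode step
# once and reuse it for every generated token. mode="reduce-overhead" enables
# CUDA graphs, which is where an A10G-class GPU gains the most per-token latency.
decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead")
```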
app.py CHANGED
@@ -251,7 +251,7 @@ def build_app():
                 # speaker,
             ],
             [audio, error],
-            # concurrency_limit=1,
+            concurrency_limit=1,
         )
 
     return app
@@ -287,7 +287,7 @@ if __name__ == "__main__":
     args = parse_args()
 
     args.precision = torch.half if args.half else torch.bfloat16
-    # args.compile = True
+    args.compile = True
 
     logger.info("Loading Llama model...")
     llama_model, decode_one_token = load_llama_model(
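The first app.py hunk restores `concurrency_limit=1` on the click listener, so requests queue instead of running concurrently on the GPU. A minimal sketch of the same pattern in isolation, assuming Gradio 4.x; the component names and the `inference` stub are placeholders, not the ones from this app:

```python
import gradio as gr

def inference(text: str) -> str:
    # Placeholder for the real GPU-bound TTS inference function.
    return f"synthesized: {text}"

with gr.Blocks() as app:
    text = gr.Textbox(label="Text")
    generate = gr.Button("Generate")
    output = gr.Textbox(label="Output")

    # concurrency_limit=1 queues requests to this event so that at most one
    # inference call touches the GPU at a time.
    generate.click(inference, [text], [output], concurrency_limit=1)

app.launch()
```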
tools/llama/generate.py CHANGED
@@ -14,7 +14,7 @@ from loguru import logger
 from tqdm import tqdm
 from transformers import AutoTokenizer
 
-from fish_speech.datasets.text import CODEBOOK_EOS_TOKEN_ID
+from fish_speech.datasets.text import CODEBOOK_EOS_TOKEN_ID, CODEBOOK_PAD_TOKEN_ID
 from fish_speech.text.clean import clean_text
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -291,11 +291,11 @@ def encode_tokens(
 ):
     string = clean_text(string)
 
-    if speaker is not None:
-        string = f"[SPK: {speaker}] {string}"
+    if speaker is None:
+        speaker = "assistant"
 
     string = (
-        f"<|im_start|>user<|im_sep|>{string}<|im_end|><|im_start|>assistant<|im_sep|>"
+        f"<|im_start|>user<|im_sep|>{string}<|im_end|><|im_start|>{speaker}<|im_sep|>"
     )
     if bos:
         string = f"<|begin_of_sequence|>{string}"
@@ -309,7 +309,10 @@ def encode_tokens(
     tokens = torch.tensor([new_tokens], dtype=torch.int, device=device)
 
     # Codebooks
-    zeros = torch.zeros((num_codebooks, tokens.size(1)), dtype=torch.int, device=device)
+    zeros = (
+        torch.ones((num_codebooks, tokens.size(1)), dtype=torch.int, device=device)
+        * CODEBOOK_PAD_TOKEN_ID
+    )
     prompt = torch.cat((tokens, zeros), dim=0)
 
     if prompt_tokens is None:
@@ -331,13 +334,23 @@ def encode_tokens(
     )
     data = data[:num_codebooks]
 
+    # Add eos token for each codebook
+    data = torch.cat(
+        (
+            data,
+            torch.ones((data.size(0), 1), dtype=torch.int, device=device)
+            * CODEBOOK_EOS_TOKEN_ID,
+        ),
+        dim=1,
+    )
+
     # Since 1.0, we use <|semantic|>
     s0_token_id = tokenizer.convert_tokens_to_ids("<|semantic|>")
-    main_token_ids = torch.tensor(
-        [[s0_token_id] * data.size(1)],
-        dtype=torch.int,
-        device=device,
+    end_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>")
+    main_token_ids = (
+        torch.ones((1, data.size(1)), dtype=torch.int, device=device) * s0_token_id
     )
+    main_token_ids[0, -1] = end_token_id
 
     data = torch.cat((main_token_ids, data), dim=0)
     prompt = torch.cat((prompt, data), dim=1)
@@ -450,6 +463,20 @@ def generate_long(
     use_prompt = prompt_text is not None and prompt_tokens is not None
     encoded = []
     texts = split_text(text, chunk_length) if iterative_prompt else [text]
+
+    if use_prompt:
+        encoded.append(
+            encode_tokens(
+                tokenizer,
+                prompt_text,
+                prompt_tokens=prompt_tokens,
+                bos=True,
+                device=device,
+                speaker=speaker,
+                num_codebooks=model.config.num_codebooks,
+            )
+        )
+
     for idx, text in enumerate(texts):
         encoded.append(
             encode_tokens(
@@ -457,25 +484,12 @@
                 string=text,
                 bos=idx == 0 and not use_prompt,
                 device=device,
-                speaker=None,
+                speaker=speaker,
                 num_codebooks=model.config.num_codebooks,
             )
         )
         logger.info(f"Encoded text: {text}")
 
-    if use_prompt:
-        encoded_prompt = encode_tokens(
-            tokenizer,
-            prompt_text,
-            prompt_tokens=prompt_tokens,
-            bos=True,
-            device=device,
-            speaker=speaker,
-            num_codebooks=model.config.num_codebooks,
-        )
-
-        encoded[0] = torch.cat((encoded_prompt, encoded[0]), dim=1)
-
     for sample_idx in range(num_samples):
         torch.cuda.synchronize()
         global_encoded = []
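Taken together, the encode_tokens changes alter the prompt layout: codebook rows under the text region are filled with CODEBOOK_PAD_TOKEN_ID instead of zeros, the prompt codebooks get one extra EOS column, and the main row becomes <|semantic|> ids terminated by <|im_end|>. A self-contained sketch of that layout; the constant values and token ids below are placeholders, not the real ones from fish_speech:

```python
import torch

# Placeholder values; the real ids come from fish_speech.datasets.text and the tokenizer.
CODEBOOK_PAD_TOKEN_ID = 0
CODEBOOK_EOS_TOKEN_ID = 1
SEMANTIC_TOKEN_ID = 32000   # stand-in for the tokenizer id of <|semantic|>
IM_END_TOKEN_ID = 32001     # stand-in for the tokenizer id of <|im_end|>

num_codebooks = 4
text_tokens = torch.tensor([[11, 12, 13]], dtype=torch.int)                 # (1, T_text)
prompt_codes = torch.randint(2, 100, (num_codebooks, 2), dtype=torch.int)   # (C, T_audio)

# Text region: codebook rows carry the pad id (previously zeros).
pad = torch.full((num_codebooks, text_tokens.size(1)), CODEBOOK_PAD_TOKEN_ID, dtype=torch.int)
prompt = torch.cat((text_tokens, pad), dim=0)                                # (C + 1, T_text)

# Audio region: append one EOS column to every codebook row ...
codes = torch.cat(
    (prompt_codes, torch.full((num_codebooks, 1), CODEBOOK_EOS_TOKEN_ID, dtype=torch.int)),
    dim=1,
)
# ... and build the main row from <|semantic|> ids, ending with <|im_end|>.
main = torch.full((1, codes.size(1)), SEMANTIC_TOKEN_ID, dtype=torch.int)
main[0, -1] = IM_END_TOKEN_ID

prompt = torch.cat((prompt, torch.cat((main, codes), dim=0)), dim=1)
print(prompt.shape)  # torch.Size([5, 6]) == (num_codebooks + 1, T_text + T_audio + 1)
```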