runningSnail committed: update modeling code
Commit 6ad73d9 • 1 parent: 34728e2

Files changed:
- README.md (+58, -15)
- modeling_dolphin.py (+57, -10)
README.md
CHANGED

@@ -48,21 +48,64 @@ from transformers import AutoTokenizer
 from configuration_dolphin import DolphinForCausalLM
 import time

(15 blank lines removed)
+AutoConfig.register("dolphin", DolphinConfig)
+AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+
+MEMORY_SIZE = 32
+def inference_instruct(mycontext, question, device="cuda:0"):
+    start = time.time()
+    generated_token_ids = []
+    # Build the prompt around the user question; the <context> marker splits it
+    # into the chunks before and after the reserved memory slots
+    prompt = f" <context>{question}"
+    print("input prompt: " + prompt)
+    print("input context: " + mycontext)
+    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+    # Reserve MEMORY_SIZE positions (sentinel id -1) between the two prompt chunks
+    input_ids = (
+        torch.tensor(text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long)
+        .unsqueeze(0)
+        .to(device)
+    )
+    # Tokenize the context together with the 32 memory placeholder tokens
+    context_tokenized = tokenizer(
+        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+        return_tensors="pt",
+    )
+    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE
+    print("length of context: " + str(context_token_count) + " tokens")
+    # Greedy token-by-token decoding loop
+    for i in range(context_token_count):
+        print(f"\rGenerating token {i+1}/{context_token_count}", end="")
+        next_token = (
+            model(
+                input_ids,
+                context_input_ids=context_tokenized["input_ids"],
+                context_attention_mask=context_tokenized["attention_mask"],
+            )
+            .logits[:, -1]
+            .argmax(-1)
+        )
+        if next_token.item() == 151643:  # stop at the tokenizer's end-of-text id
+            break
+        generated_token_ids.append(next_token.item())
+        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+    print("\noutput: " + tokenizer.decode(generated_token_ids))
+    end = time.time()
+    print(f"Elapsed time: {end - start:.2f}s")
+
+
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True).to("cuda:0")
+
+# Run inference examples
+mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+inference_instruct(mycontext, "who founded Nexa AI?")
+inference_instruct(mycontext, "what is the mission of Nexa AI?")
+inference_instruct(mycontext, "what is the performance of Octopus V2 and V3?")
+inference_instruct(mycontext, "when is Nexa AI founded?")
 ```

 ## Training Process
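A note on two assumptions the README example above makes: the decoding loop stops on the hard-coded token id 151643, which is the `<|endoftext|>` id in the Qwen2 tokenizer family this model builds on, and every input tensor is moved to `cuda:0`, so the model must live on that device as well. Below is a minimal sketch of how one might make both explicit with the standard `transformers` API; the checkpoint name comes from the example, while the rest is an illustrative assumption rather than part of this commit.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the remote-code checkpoint and keep the model on the same device as the inputs
tokenizer = AutoTokenizer.from_pretrained("NexaAIDev/Dolphin", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("NexaAIDev/Dolphin", trust_remote_code=True).to(device)

# Read the stop token from the tokenizer instead of hard-coding 151643
stop_id = tokenizer.eos_token_id
print("stop token id:", stop_id, "->", tokenizer.decode([stop_id]))
```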
modeling_dolphin.py
CHANGED

@@ -15,13 +15,14 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
 )
 from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
 import torch
 import torch.nn as nn
 from typing import List, Optional, Tuple, Union
 import warnings
 from dataclasses import dataclass
 from torch.nn import CrossEntropyLoss
-from .configuration_dolphin import encoder_config_dict,
+from .configuration_dolphin import encoder_config_dict, DolphinConfig

 CONTEXT_EMB = 896  # Qwen 0.7B has dimension of 896
 HIDDEN_EMB = 3584  # Qwen 7B has dimension of 3584

@@ -187,7 +188,7 @@ class DolphinModel(Qwen2PreTrainedModel):
     """
     config_class = DolphinConfig

-    def __init__(self, config:
+    def __init__(self, config: DolphinConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size

@@ -731,17 +732,63 @@ class DolphinForCausalLM(Qwen2PreTrainedModel):
         )
         return reordered_past

+MEMORY_SIZE = 32
+def inference_instruct(mycontext, question, device="cuda:0"):
+    import time
+    start = time.time()
+    generated_token_ids = []
+    # Build the prompt around the user question; the <context> marker splits it
+    # into the chunks before and after the reserved memory slots
+    prompt = f" <context>{question}"
+    print("input prompt: " + prompt)
+    print("input context: " + mycontext)
+    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+    # Reserve MEMORY_SIZE positions (sentinel id -1) between the two prompt chunks
+    input_ids = (
+        torch.tensor(text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long)
+        .unsqueeze(0)
+        .to(device)
+    )
+    # Tokenize the context together with the 32 memory placeholder tokens
+    context_tokenized = tokenizer(
+        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+        return_tensors="pt",
+    )
+    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE
+    print("length of context: " + str(context_token_count) + " tokens")
+    # Greedy token-by-token decoding loop
+    for i in range(context_token_count):
+        print(f"\rGenerating token {i+1}/{context_token_count}", end="")
+        next_token = (
+            model(
+                input_ids,
+                context_input_ids=context_tokenized["input_ids"],
+                context_attention_mask=context_tokenized["attention_mask"],
+            )
+            .logits[:, -1]
+            .argmax(-1)
+        )
+        if next_token.item() == 151643:  # stop at the tokenizer's end-of-text id
+            break
+        generated_token_ids.append(next_token.item())
+        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+    print("\noutput: " + tokenizer.decode(generated_token_ids))
+    end = time.time()
+    print(f"Elapsed time: {end - start:.2f}s")
+
+
 if __name__ == "__main__":
     # Register your configuration and model
     AutoConfig.register("dolphin", DolphinConfig)
     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

-    # Ensure that your config is loaded with the correct model type
-    config = DolphinConfig(encoder_config=encoder_config_dict)
-
-    # Save the configuration (if needed)
-    config.save_pretrained("dolphin")
-
     # Load the tokenizer and model
-    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin',
-    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin',
+    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True).to("cuda:0")
+
+    # Run inference examples
+    mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
    inference_instruct(mycontext, "who founded Nexa AI?")
    inference_instruct(mycontext, "what is the mission of Nexa AI?")
    inference_instruct(mycontext, "what is the performance of Octopus V2 and V3?")
    inference_instruct(mycontext, "when is Nexa AI founded?")
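Both files in this commit now share the same memory-slot interface: the prompt reserves MEMORY_SIZE positions with the sentinel id -1, and the context is tokenized with `[memory_0]` … `[memory_31]` appended, which is presumably where the compressed context representation is injected. The following is a small sketch, assuming only that the published tokenizer recognizes the `[memory_i]` strings, of how one might inspect that interface before running the full model; it is an illustration, not part of the committed code.

```python
from transformers import AutoTokenizer

MEMORY_SIZE = 32
tokenizer = AutoTokenizer.from_pretrained("NexaAIDev/Dolphin", trust_remote_code=True)

# Context side: plain text followed by the 32 memory placeholder tokens
memory_suffix = "".join(f"[memory_{i}]" for i in range(MEMORY_SIZE))
suffix_ids = tokenizer(memory_suffix, add_special_tokens=False).input_ids
# If each placeholder maps to a single token, the suffix contributes exactly MEMORY_SIZE ids
print("placeholder ids:", suffix_ids[:4], "... count =", len(suffix_ids))

# Prompt side: -1 sentinels mark where the compressed context will be injected
chunks = " <context>who founded Nexa AI?".split("<context>")
prompt_ids = tokenizer(chunks[0]).input_ids + [-1] * MEMORY_SIZE + tokenizer(chunks[1]).input_ids
print("prompt length with memory sentinels:", len(prompt_ids))
```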