runningSnail committed
Commit a33ba53
1 Parent(s): 488b987

update example

Files changed (1)
  1. README.md +29 -30
README.md CHANGED
@@ -48,36 +48,29 @@ from transformers import AutoTokenizer
 from configuration_dolphin import DolphinForCausalLM
 import time
 
-AutoConfig.register("dolphin", DolphinConfig)
-AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
-
-MEMORY_SIZE = 32
-def inference_instruct(mycontext, device = "cuda:0"):
+def inference_instruct(mycontext, question, device="cuda:0"):
     import time
-    start = time.time()
+    MEMORY_SIZE = 32
+    start_time = time.time()
     generated_token_ids = []
-    prompt = " <context>Who and when founded the Shanda group?"
-    print("input prompt: " + prompt)
-    print("input context: " + mycontext)
+    prompt = f" <context>{question}"
     text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
     input_ids = (
-        torch.tensor(text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long)
+        torch.tensor(
+            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+        )
         .unsqueeze(0)
         .to(device)
     )
-    # print(input_ids)
     # to process the context
     context_tokenized = tokenizer(
         mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
         return_tensors="pt",
     )
     context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
-    # print(context_tokenized["input_ids"])
-    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
-    print("length of context: " + str(context_token_count) + " tokens")
+    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
     # We conduct a inference process
     for i in range(context_token_count):
-        print(f"\rGenerating token {i+1}/{context_token_count}", end="")
         next_token = (
             model(
                 input_ids,
@@ -91,21 +84,27 @@ def inference_instruct(mycontext, device = "cuda:0"):
             break
         generated_token_ids.append(next_token.item())
         input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
-    print("\noutput: " + tokenizer.decode(generated_token_ids))
-    end = time.time()
-    print(f"Elapsed time: {end - start:.2f}s")
-
-
-# Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
-
-# Run inference example
-mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
-inference_instruct(mycontext, "who founded Nexa AI?")
-inference_instruct(mycontext, "what is the mission of Nexa AI?")
-inference_instruct(mycontext, "what is the performance of Octopus V2 and V3?")
-inference_instruct(mycontext, "when is Nexa AI founded?")
+    result = tokenizer.decode(generated_token_ids)
+    print(f"Time taken: {time.time() - start_time}")
+    return result
+
+
+if __name__ == "__main__":
+    # Register your configuration and model
+    AutoConfig.register("dolphin", DolphinConfig)
+    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+    # Load the tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
+
+    # Run inference example
+    mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+    question = "Who founded Nexa AI?"
+    # Pass the context and the correct device string
+    result = inference_instruct(mycontext, question, device=device_name)
+    print("Result:", result)
 ```
 
 ## Training Process
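The one structural idea in this example, unchanged by the commit, is how the model input is assembled: the prompt is split on the `<context>` tag, and `MEMORY_SIZE` placeholder ids (`-1`) are spliced in where the compressed context will be attended to, while the context itself is tokenized separately with `[memory_0]` through `[memory_31]` suffix tokens. A minimal sketch of that layout, using a fake per-character tokenizer so it runs without loading Dolphin (the `fake_ids` helper is illustrative only, not part of the repository):

```python
# Illustration of the input layout built by inference_instruct above.
# The real example uses the Dolphin tokenizer; ids here are faked.
MEMORY_SIZE = 32  # same constant as in the example

def fake_ids(text):
    # Stand-in tokenizer: one id per character, purely illustrative.
    return [ord(c) for c in text]

prompt = " <context>Who founded Nexa AI?"
prefix, question = prompt.split("<context>")

# The -1 placeholders mark the slots the model fills from the context,
# which is tokenized separately with [memory_0]..[memory_31] appended.
input_ids = fake_ids(prefix) + [-1] * MEMORY_SIZE + fake_ids(question)
assert input_ids.count(-1) == MEMORY_SIZE
```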
 
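The previous revision ran four questions at module level (and, with the old signature, passed each question string into the `device` parameter). With the refactored function, the same demo becomes a loop over `inference_instruct` calls. A minimal sketch, assuming the `__main__` setup above (registration, `tokenizer`, `model`, `mycontext`, `device_name`) has already run:

```python
# Multi-question demo mirroring the previous revision's example,
# assuming tokenizer, model, mycontext, and device_name exist as above.
questions = [
    "who founded Nexa AI?",
    "what is the mission of Nexa AI?",
    "what is the performance of Octopus V2 and V3?",
    "when is Nexa AI founded?",
]
for question in questions:
    result = inference_instruct(mycontext, question, device=device_name)
    print(f"Q: {question}\nA: {result}")
```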