import os
import sys
import time

# Make the sibling Dolphin modules importable when this file is run from
# another working directory; must run before the local imports below.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

import torch
from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoConfig)

from configuration_dolphin import DolphinConfig
from modeling_dolphin import DolphinForCausalLM


def inference_instruct(mycontext, question, device="cuda:0", *, model=None, tokenizer=None):
    """Greedily decode an answer to ``question`` conditioned on ``mycontext``.

    The decoder input is built as: the token ids of a single space, then
    ``MEMORY_SIZE`` placeholder ids (``-1``), then the token ids of
    ``question``. The context is tokenized separately with ``MEMORY_SIZE``
    ``[memory_i]`` markers appended and is passed to the model on every step
    via ``context_input_ids`` / ``context_attention_mask``.

    Each step takes the arg-max of the last position's logits. Decoding stops
    when the end-of-text id is produced, or after a number of steps equal to
    the tokenized context length minus ``MEMORY_SIZE``. The elapsed wall time
    is printed before returning.

    Args:
        mycontext: Context passage the answer should be based on.
        question: Question text. It is tokenized as-is, so a literal
            ``"<context>"`` inside it no longer truncates the question.
        device: Torch device string the input tensors are moved to.
        model: Optional model callable. When ``None`` the module-level
            ``model`` global is used (the original behaviour).
        tokenizer: Optional tokenizer. When ``None`` the module-level
            ``tokenizer`` global is used (the original behaviour).

    Returns:
        The decoded text of the generated token ids (end-of-text excluded).

    Raises:
        NameError: If ``model`` / ``tokenizer`` is not passed and no
            module-level global of that name exists.
    """
    MEMORY_SIZE = 32
    # Generation stops when this id is produced. Presumably the tokenizer's
    # end-of-text id -- TODO confirm against the tokenizer config.
    EOS_TOKEN_ID = 151643

    # Fall back to the module-level objects so existing callers keep working.
    module_scope = globals()
    try:
        if tokenizer is None:
            tokenizer = module_scope["tokenizer"]
        if model is None:
            model = module_scope["model"]
    except KeyError as missing:
        raise NameError(
            f"name {missing.args[0]!r} is not defined; pass it explicitly "
            "or define it at module level"
        ) from None

    start_time = time.perf_counter()

    # Tokenize the two prompt pieces directly instead of formatting them into
    # one string and splitting on "<context>": same result for ordinary
    # questions, and a question that itself contains "<context>" stays intact.
    prefix_ids = tokenizer(" ").input_ids
    question_ids = tokenizer(question).input_ids
    input_ids = (
        torch.tensor(
            prefix_ids + [-1] * MEMORY_SIZE + question_ids, dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )

    memory_markers = "".join(f"[memory_{i}]" for i in range(MEMORY_SIZE))
    context_tokenized = tokenizer(mycontext + memory_markers, return_tensors="pt")
    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
    # Step budget: tokenized context length without the appended markers
    # (assumes each marker maps to exactly one token id).
    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE

    generated_token_ids = []
    # Pure inference: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(context_token_count):
            next_token = (
                model(
                    input_ids,
                    context_input_ids=context_tokenized["input_ids"],
                    context_attention_mask=context_tokenized["attention_mask"],
                )
                .logits[:, -1]
                .argmax(-1)
            )
            # Read the id once; each .item() forces a device-to-host sync.
            token_id = next_token.item()
            if token_id == EOS_TOKEN_ID:
                break
            generated_token_ids.append(token_id)
            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)

    result = tokenizer.decode(generated_token_ids)
    print(f"Time taken: {time.perf_counter() - start_time}")
    return result


if __name__ == "__main__":
    # Demo entry point: load the Dolphin checkpoint and answer one question
    # about a short context passage.
    MODEL_REPO = 'NexaAIDev/Dolphin'

    # Register the local Dolphin classes with the transformers Auto* factories
    # before loading the checkpoint.
    AutoConfig.register("dolphin", DolphinConfig)
    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

    if torch.cuda.is_available():
        device_name = "cuda:0"
    else:
        device_name = "cpu"

    # inference_instruct reads these two as module-level globals, so their
    # names must stay `tokenizer` and `model`.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=device_name,
    )

    # Adjacent literals concatenate into the same single string as before.
    mycontext = (
        "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. "
        "The company is founded by Alex and Zack. "
        "The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. "
        "Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. "
        "To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
    )
    question = "Who founded Nexa AI?"

    result = inference_instruct(mycontext, question, device=device_name)
    print("Result:", result)