
This model is fine-tuned from Mistral-7B-Instruct-v0.2 on a dataset that is 90% Chinese and 10% English.

GitHub Web-UI


from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Local path or Hub repo id of this model
model_id = "Mistral-7B-Instruct-v0.4"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

chat_template="{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"

def chat_format(conversation: list):
    # Note: the template above only accepts user/assistant roles, so this
    # system prompt is not injected into the rendered prompt.
    system_prompt = "You are a helpful, respectful and honest assistant. Help humans as much as you can."

    prompt = tokenizer.apply_chat_template(conversation, chat_template=chat_template, tokenize=False)

    return prompt

user_chat=[{"role":"user","content":"你好,最近在干嘛呢"}]
text = chat_format(user_chat).rstrip("</s>")
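# For reference, the rendered single-turn prompt above should look roughly like
# this (a sketch, assuming the default Mistral special tokens; the trailing
# rstrip("</s>") removes nothing here because no assistant turn with an EOS has
# been appended yet):
#   <s>[INST] 你好,最近在干嘛呢 [/INST]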
def predict(content_prompt):
    inputs = tokenizer(content_prompt, return_tensors="pt", add_special_tokens=True)
    input_ids = inputs["input_ids"].to("cuda:0")
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=2048,
            top_p=0.9,
            num_beams=1,
            do_sample=True,
            repetition_penalty=1.0,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
        s = generation_output.sequences[0]
        output = tokenizer.decode(s, skip_special_tokens=True)
        # Keep only the text generated after the last [/INST] tag.
        output1 = output.split("[/INST]")[-1].strip()
    return output1

predict(text)
Output: 你好！作为一个大型语言模型，我一直在学习和提高自己的能力。最近，我一直在努力学习新知识、改进算法，以便更好地回答用户的问题并提供帮助。同时，我也会定期接受人工智能专家的指导和评估，以确保我的表现不断提升。希望这些信息对你有所帮助！
(English: "Hello! As a large language model, I am always learning and improving my abilities. Lately I have been working hard to learn new knowledge and improve my algorithms so I can better answer users' questions and help them. I also receive regular guidance and evaluation from AI experts to make sure my performance keeps improving. Hope this information helps!")
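For multi-turn chat, append the model's reply back into the conversation as an assistant turn before formatting again. A minimal sketch reusing chat_format and predict from above (the follow-up question is only an illustration):

reply = predict(text)
history = user_chat + [
    {"role": "assistant", "content": reply},
    {"role": "user", "content": "Can you write a short poem about spring?"},
]
# Re-render the full history; the template enforces user/assistant alternation.
next_prompt = chat_format(history).rstrip("</s>")
print(predict(next_prompt))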

vLLM server
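The --chat-template flag below expects the Jinja template from the Python example saved to a file. A minimal sketch, assuming the chat_template string defined above:

# Save the chat template so vLLM can load it at startup.
with open("llama2-chat-template.jinja", "w", encoding="utf-8") as f:
    f.write(chat_template)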

# llama2-chat-template.jinja contains the chat template string shown above
model_path=Mistral-7B-Instruct-v0.4
python  -m vllm.entrypoints.openai.api_server --model=$model_path \
        --trust-remote-code --host 0.0.0.0  --port 7777 \
        --gpu-memory-utilization 0.8 \
        --max-model-len 8192 --chat-template llama2-chat-template.jinja \
        --tensor-parallel-size 1 --served-model-name chatbot
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:7777/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
call_args = {
        'temperature': 0.7,
        'top_p': 0.9,
        'top_k': 40,
        'max_tokens': 2048, # output-len
        'presence_penalty': 1.0,
        'frequency_penalty': 0.0,
        "repetition_penalty":1.0,
        "stop":["</s>"],
    }
chat_response = client.chat.completions.create(
    model="chatbot",
    messages=[
        {"role": "user", "content": "你好"},
    ],
    extra_body=call_args
)
print("Chat response:", chat_response)

Model size: 7.24B parameters · Tensor type: BF16 · Format: Safetensors