"""
Qwen2 chat simulator backed by llama.cpp (through llama-cpp-python).

References:
    https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
    https://github.com/awinml/llama-cpp-python-bindings

Converting a Hugging Face checkpoint to GGUF:
    python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
    python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/

Running the converted model with llama-cli:
    ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
    ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
    ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
"""

import json
import copy
from simulator import Simulator
import llama_cpp
# Imported explicitly: the class below uses llama_cpp.llama_tokenizer.LlamaHFTokenizer,
# and a submodule attribute is only guaranteed to exist once the submodule is imported.
import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer
from utils.logging_util import logger


class Qwen2Simulator(Simulator):
    """Simulate both sides of a chat with Qwen2-0.5B-Instruct (GGUF, fp16).

    ``generate_response`` produces the assistant turn that follows a user
    message; ``generate_query`` produces the next *user* turn that follows an
    assistant (or system) message.
    """

    def __init__(self, from_local: bool = False):
        """Load the Hugging Face tokenizer and the llama.cpp model.

        :param from_local: when true, load tokenizer and GGUF weights from the
            hard-coded ``/workspace/xusong/...`` paths; otherwise resolve both
            from the ``Qwen/Qwen2-0.5B-Instruct`` repositories by name.
        """
        if from_local:
            tokenizer_source = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/"
        else:
            tokenizer_source = "Qwen/Qwen2-0.5B-Instruct"
        self.hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

        # The HF tokenizer is handed to llama.cpp so both the chat template
        # and the model use the same tokenizer object.
        llama_tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer)

        if from_local:
            self.llm = llama_cpp.Llama(
                model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
                tokenizer=llama_tokenizer,
                verbose=False,
            )
        else:
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                filename="*fp16.gguf",
                tokenizer=llama_tokenizer,
                verbose=False,
            )
        logger.info(f"llm has been initialized: {self.llm}")

        self.generation_kwargs = dict(
            # NOTE(review): 5 is an unusually high sampling temperature - confirm it is intentional.
            temperature=5,
            # top_p=0.1,
            top_k=40,
            max_tokens=20,
            repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes finishes its content without emitting
            # <|im_end|> and goes straight on to <|im_start|>, so both are stops.
            stop=[
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )

    ### local
    def generate_query(self, messages, stream: bool = True):
        """Generate the next user turn for a conversation.

        :param messages: chat history as a list of ``{"role", "content"}``
            dicts; the last message must not come from the user.
        :param stream: when true, return a generator that yields the
            accumulated text after every chunk; otherwise return the full text.
        :return: a generator of growing strings, or a single string.
        :raises AssertionError: if the last message has role ``"user"``.
        """
        assert messages[-1]["role"] != "user", \
            "generate_query expects the last message not to be a user message"
        # ensure_ascii=False keeps non-ASCII content readable in the log,
        # matching what generate_response logs.
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        # Open a user turn by hand so the model continues as the user.
        inputs = inputs + "<|im_start|>user\n"
        return self._complete(inputs, stream)

    def generate_response(self, messages, stream: bool = True):
        """Generate the assistant turn that answers the last user message.

        :param messages: chat history as a list of ``{"role", "content"}``
            dicts; the last message must come from the user.
        :param stream: when true, return a generator that yields the
            accumulated text after every chunk; otherwise return the full text.
        :return: a generator of growing strings, or a single string.
        :raises AssertionError: if the last message does not have role ``"user"``.
        """
        assert messages[-1]["role"] == "user", \
            "generate_response expects the last message to be a user message"
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        return self._complete(inputs, stream)

    def _complete(self, inputs: str, stream: bool):
        """Dispatch a rendered prompt to the streaming or blocking generator."""
        if stream:
            return self._stream_generate(inputs)
        return self._generate(inputs)

    def _generate(self, inputs: str):
        """Run one blocking completion and return the generated text.

        TODO: chat with cache.
        """
        logger.info(f"generation_kwargs {self.generation_kwargs}")
        output = self.llm(
            inputs,
            **self.generation_kwargs
        )
        output_text = output["choices"][0]["text"]
        return output_text

    def _stream_generate(self, inputs: str):
        """Yield the accumulated completion text after each streamed chunk."""
        output = self.llm(
            inputs,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        for out in output:
            # NOTE(review): the chunk is only read here, so this copy looks
            # unnecessary - confirm before removing.
            stream = copy.deepcopy(out)
            generated_text += stream["choices"][0]["text"]
            yield generated_text


# Shared module-level instance; constructing it loads the tokenizer and model at import time.
bot = Qwen2Simulator()

if __name__ == "__main__":
    # messages = [
    #     {"role": "system", "content": "you are a helpful assistant"},
    #     {"role": "user", "content": "What is the capital of France?"}
    # ]
    # output = bot.generate_response(messages)
    # print(output)

    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what your name"},
        {"role": "assistant", "content": "My name is Jordan"}
    ]
    print(list(bot.generate_query(messages, stream=True)))
    print(bot.generate_query(messages, stream=False))