"""
https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
https://github.com/awinml/llama-cpp-python-bindings
python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
"""
import copy
import json

import llama_cpp
import llama_cpp.llama_tokenizer
from transformers import AutoTokenizer

from simulator import Simulator
from utils.logging_util import logger


class Qwen2Simulator(Simulator):
def __init__(self, from_local=False):
if from_local:
self.hf_tokenizer = AutoTokenizer.from_pretrained(
"/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
self.llm = llama_cpp.Llama(
model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
verbose=False,
)
else:
self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
self.llm = llama_cpp.Llama.from_pretrained(
repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
filename="*fp16.gguf",
tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
verbose=False,
)
logger.info(f"llm has been initialized: {self.llm}")
self.generation_kwargs = dict(
temperature=5,
# top_p=0.1,
top_k=40,
max_tokens=20,
repeat_penalty=1.1,
            # qwen2-0.5b-chat sometimes finishes generating without emitting <|im_end|> and goes straight to <|im_start|>
stop=[
"<|im_end|>",
"<|im_start|>",
"<|endoftext|>",
],
)

    ### local
    def generate_query(self, messages, stream=True):
        """Generate the next user query from the conversation so far.

        :param messages: chat history; the last message must not be a user turn
        :return: generated text, or a generator of cumulative text if stream=True
        """
assert messages[-1]["role"] != "user"
logger.info(f"generating {json.dumps(messages)}")
inputs = self.hf_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=False,
)
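        # open a new user turn so the model continues by writing the user's next message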
inputs = inputs + "<|im_start|>user\n"
if stream:
return self._stream_generate(inputs)
else:
return self._generate(inputs)

    def generate_response(self, messages, stream=True):
assert messages[-1]["role"] == "user"
logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
inputs = self.hf_tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
if stream:
return self._stream_generate(inputs)
else:
return self._generate(inputs)

    def _generate(self, inputs):
        """Non-streaming generation.

        TODO: chat with cache.
        """
logger.info(f"generation_kwargs {self.generation_kwargs}")
output = self.llm(
inputs,
**self.generation_kwargs
)
output_text = output["choices"][0]["text"]
return output_text

    def _stream_generate(self, inputs):
output = self.llm(
inputs,
stream=True,
**self.generation_kwargs
)
generated_text = ""
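        # yield the cumulative text so far (not just the new delta) for each streamed chunk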
for out in output:
stream = copy.deepcopy(out)
generated_text += stream["choices"][0]["text"]
yield generated_text


bot = Qwen2Simulator()


if __name__ == "__main__":
# messages = [
# {"role": "system", "content": "you are a helpful assistant"},
# {"role": "user", "content": "What is the capital of France?"}
# ]
# output = bot.generate_response(messages)
# print(output)
messages = [
{"role": "system", "content": "you are a helpful assistant"},
{"role": "user", "content": "hi, what your name"},
{"role": "assistant", "content": "My name is Jordan"}
]
print(list(bot.generate_query(messages, stream=True)))
print(bot.generate_query(messages, stream=False))