"""
References:
  https://github.com/abetlen/llama-cpp-python/blob/main/examples/gradio_chat/local.py
  https://github.com/awinml/llama-cpp-python-bindings

Convert the HF checkpoint to GGUF (f16):
  python convert_hf_to_gguf.py --outtype f16 Qwen1.5-0.5B-Chat
  python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/

Run llama-cli with an inline prompt, a prompt file, or in conversation mode:
  ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "I believe the meaning of life is" -n 128
  ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -f prompt.txt -n 128
  ./llama-cli -m /workspace/xusong/huggingface/models/Qwen1.5-0.5B-Chat/Qwen1.5-0.5B-Chat-F16.gguf -p "You are a helpful assistant" -cnv
"""

import copy
import json

import llama_cpp
from transformers import AutoTokenizer

from simulator import Simulator
from utils.logging_util import logger


class Qwen2Simulator(Simulator):

    def __init__(self, from_local=False):
        if from_local:
            # Load the GGUF model from a local path, reusing the HF tokenizer so
            # tokenization matches the original Qwen2 tokenizer.
            self.hf_tokenizer = AutoTokenizer.from_pretrained(
                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
            self.llm = llama_cpp.Llama(
                model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                verbose=False,
            )
        else:
            # Download both the tokenizer and the GGUF weights from the Hugging Face Hub.
            self.hf_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
            self.llm = llama_cpp.Llama.from_pretrained(
                repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                filename="*fp16.gguf",
                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                verbose=False,
            )
        logger.info(f"llm has been initialized: {self.llm}")

        self.generation_kwargs = dict(
            temperature=5,  # note: far above typical chat settings (~0.7-1.0); sampling is very diverse
            top_k=40,
            max_tokens=20,
            repeat_penalty=1.1,
            stop=[  # ChatML special tokens that terminate a turn
                "<|im_end|>",
                "<|im_start|>",
                "<|endoftext|>",
            ],
        )

    def generate_query(self, messages, stream=True):
        """Generate the next simulated *user* turn for a conversation.

        :param messages: list of {"role": ..., "content": ...} dicts; the last
            message must not be a user turn.
        :return: the generated text, or a generator of cumulative text if stream=True.
        """
        assert messages[-1]["role"] != "user"
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
        )
        # Open a new user turn so the model completes the user's next message.
        inputs = inputs + "<|im_start|>user\n"
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)

    def generate_response(self, messages, stream=True):
        """Generate the assistant's reply to the latest user turn."""
        assert messages[-1]["role"] == "user"
        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
        inputs = self.hf_tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        if stream:
            return self._stream_generate(inputs)
        else:
            return self._generate(inputs)

    def _generate(self, inputs):
        """Run a single non-streaming completion.

        TODO: chat with cache.
        """
        logger.info(f"generation_kwargs {self.generation_kwargs}")
        output = self.llm(
            inputs,
            **self.generation_kwargs
        )
        output_text = output["choices"][0]["text"]
        return output_text

    def _stream_generate(self, inputs):
        output = self.llm(
            inputs,
            stream=True,
            **self.generation_kwargs
        )
        generated_text = ""
        for out in output:
            stream = copy.deepcopy(out)
            generated_text += stream["choices"][0]["text"]
            # Yield the cumulative text so far rather than per-chunk deltas.
            yield generated_text


bot = Qwen2Simulator()


if __name__ == "__main__":
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what's your name?"},
        {"role": "assistant", "content": "My name is Jordan"},
    ]
    print(list(bot.generate_query(messages, stream=True)))
    print(bot.generate_query(messages, stream=False))
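
    # A further illustrative example (variable names here are my own): generate_response
    # produces the assistant side of the conversation when the last message is a user turn,
    # streamed as cumulative text and then as a single non-streaming string.
    response_messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "hi, what's your name?"},
    ]
    for partial in bot.generate_response(response_messages, stream=True):
        print(partial)
    print(bot.generate_response(response_messages, stream=False))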