xu song committed on
Commit
b099d9e
1 Parent(s): b420ebd
Files changed (3)
  1. app.py +14 -7
  2. app_util.py +16 -21
  3. models/cpp_qwen2.py +38 -49
app.py CHANGED
@@ -14,9 +14,15 @@ system_list = [
 ]
 
 user_simulator_doc = """\
 There are mainly two types of user simulator:
 - prompt-based user-simulator (role-play)
 - model-based user-simulator
 """
 
 with gr.Blocks() as demo:
@@ -101,16 +107,17 @@ with gr.Blocks() as demo:
     )
 
     ########
-    history = gr.State([{"role": "system", "content": system_list[0]}])
-    system.change(reset_state, inputs=[system], outputs=[chatbot, history])
-    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history])
 
-    generate_btn.click(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
                        show_progress="full")
-    retry_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history]) \
-        .then(generate, [chatbot, history], outputs=[generated_text_1, chatbot, history],
                        show_progress="full")
-    undo_btn.click(undo_generate, [chatbot, history], outputs=[generated_text_1, chatbot, history])
 
     slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
     slider_temperature.change(set_temperature, inputs=[slider_temperature])
 
 ]
 
 user_simulator_doc = """\
+
+
+
 There are mainly two types of user simulator:
 - prompt-based user-simulator (role-play)
 - model-based user-simulator
+
+In most cases, large language models (LLMs) are used to serve as the assistant generator.
+Besides, they can also be used as the user simulator.
 """
 
 with gr.Blocks() as demo:

     )
 
     ########
+    history = gr.State([{"role": "system", "content": system_list[0]}])  # only the system message carries new information here; the rest duplicates the chatbot content
+    history_tokens = gr.State([])
+    system.change(reset_state, inputs=[system], outputs=[chatbot, history, history_tokens])
+    clear_btn.click(reset_state, inputs=[system], outputs=[chatbot, history, history_tokens])
 
+    generate_btn.click(generate, [chatbot, history, history_tokens], outputs=[generated_text_1, chatbot, history, history_tokens],
                        show_progress="full")
+    retry_btn.click(undo_generate, [chatbot, history, history_tokens], outputs=[generated_text_1, chatbot, history, history_tokens]) \
+        .then(generate, [chatbot, history, history_tokens], outputs=[generated_text_1, chatbot, history, history_tokens],
                        show_progress="full")
+    undo_btn.click(undo_generate, [chatbot, history, history_tokens], outputs=[generated_text_1, chatbot, history, history_tokens])
 
     slider_max_tokens.change(set_max_tokens, inputs=[slider_max_tokens])
     slider_temperature.change(set_temperature, inputs=[slider_temperature])
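The wiring above keeps two pieces of per-session state side by side: `history` (the message-level record) and `history_tokens` (the token ids already fed to the model). A minimal sketch of this Gradio pattern, with simplified and partly hypothetical component names (not the exact committed layout), looks like:

```python
import gradio as gr

def generate(chatbot, history, history_tokens):
    # placeholder for app_util.generate: a generator that streams
    # (partial_text, chatbot, history, history_tokens) tuples back to the UI
    yield "", chatbot, history, history_tokens

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    generated_text_1 = gr.Textbox(label="generated")
    generate_btn = gr.Button("Generate")

    # per-session state: message-level history plus the token ids already evaluated
    history = gr.State([{"role": "system", "content": "You are a helpful assistant."}])
    history_tokens = gr.State([])

    generate_btn.click(
        generate,
        inputs=[chatbot, history, history_tokens],
        outputs=[generated_text_1, chatbot, history, history_tokens],
        show_progress="full",
    )

demo.launch()
```

Because `gr.State` is per session, each browser tab keeps its own conversation history and token cache.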
app_util.py CHANGED
@@ -19,54 +19,49 @@ from models.cpp_qwen2 import bot
 # gr.Chatbot.postprocess = postprocess
 
 
-def generate_query(chatbot, history):
     if history and history[-1]["role"] == "user":
         gr.Warning('You should generate assistant-response.')
         yield None, chatbot, history
     else:
         chatbot.append(None)
-        streamer = bot.generate_query(history, stream=True)
-        for query in streamer:
             chatbot[-1] = (query, None)
             yield query, chatbot, history
         history.append({"role": "user", "content": query})
-        yield query, chatbot, history
 
 
-def generate_response(chatbot, history, user_input=None):
     """
     auto mode: query is None
     manual mode: query is the user input
-    :param chatbot:
-    :param history:
-    :return:
     """
-    if user_input and history[-1]["role"] != "user":
-        history.append({"role": "user", "content": user_input})
     query = history[-1]["content"]
-
     if history[-1]["role"] != "user":
         gr.Warning('You should generate or type user-input first.')
         yield None, chatbot, history
     else:
-        streamer = bot.generate_response(history, stream=True)
-        for response in streamer:
             chatbot[-1] = (query, response)
-            yield response, chatbot, history
 
         history.append({"role": "assistant", "content": response})
         print(f"chatbot is {chatbot}")
         print(f"history is {history}")
-        yield response, chatbot, history
 
 
-def generate(chatbot, history):
     logger.info(f"chatbot: {chatbot}; history: {history}")
     streamer = None
     if history[-1]["role"] in ["assistant", "system"]:
-        streamer = generate_query(chatbot, history)
     elif history[-1]["role"] == "user":
-        streamer = generate_response(chatbot, history)
     else:
         gr.Warning("bug")
 
@@ -74,7 +69,7 @@ def generate(chatbot, history):
     yield out
 
 
-def undo_generate(chatbot, history):
     if history[-1]["role"] == "user":
         history = history[:-1]
         chatbot = chatbot[:-1]
@@ -84,7 +79,7 @@ def undo_generate(chatbot, history):
     else:
         pass
     logger.info(f"after undo, {json.dumps(chatbot, ensure_ascii=False)}, {json.dumps(history, ensure_ascii=False)}")
-    return "", chatbot, history
 
 
 def reset_user_input():
@@ -92,7 +87,7 @@ def reset_user_input():
 
 
 def reset_state(system):
-    return [], [{"role": "system", "content": system}]
 
 
 def set_max_tokens(max_tokens):
 
 # gr.Chatbot.postprocess = postprocess
 
 
+def generate_query(chatbot, history, history_tokens):
     if history and history[-1]["role"] == "user":
         gr.Warning('You should generate assistant-response.')
         yield None, chatbot, history
     else:
         chatbot.append(None)
+        streamer = bot.generate_query(history[-1], history_tokens, stream=True)
+        for query, all_tokens in streamer:
             chatbot[-1] = (query, None)
             yield query, chatbot, history
         history.append({"role": "user", "content": query})
+        yield query, chatbot, history, all_tokens
 
 
+def generate_response(chatbot, history, history_tokens):
     """
     auto mode: query is None
     manual mode: query is the user input
     """
+    logger.info(f"generating {json.dumps(history, ensure_ascii=False)}")
     query = history[-1]["content"]
     if history[-1]["role"] != "user":
         gr.Warning('You should generate or type user-input first.')
         yield None, chatbot, history
     else:
+        streamer = bot.generate_response(history[-1], history_tokens, stream=True)
+        for response, all_tokens in streamer:
             chatbot[-1] = (query, response)
+            yield response, chatbot, history, all_tokens
 
         history.append({"role": "assistant", "content": response})
         print(f"chatbot is {chatbot}")
         print(f"history is {history}")
+        yield response, chatbot, history, all_tokens
 
 
+def generate(chatbot, history, history_tokens):
     logger.info(f"chatbot: {chatbot}; history: {history}")
     streamer = None
     if history[-1]["role"] in ["assistant", "system"]:
+        streamer = generate_query(chatbot, history, history_tokens)
     elif history[-1]["role"] == "user":
+        streamer = generate_response(chatbot, history, history_tokens)
     else:
         gr.Warning("bug")
 
     yield out
 
 
+def undo_generate(chatbot, history, history_tokens):
     if history[-1]["role"] == "user":
         history = history[:-1]
         chatbot = chatbot[:-1]
 
     else:
         pass
     logger.info(f"after undo, {json.dumps(chatbot, ensure_ascii=False)}, {json.dumps(history, ensure_ascii=False)}")
+    return "", chatbot, history, history_tokens
 
 
 def reset_user_input():
 
 
 def reset_state(system):
+    return [], [{"role": "system", "content": system}], []
 
 
 def set_max_tokens(max_tokens):
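The handler changes above follow a new streaming contract with the bot: `bot.generate_query` / `bot.generate_response` now take only the latest message plus the cached `history_tokens`, and yield `(text, all_tokens)` pairs instead of bare strings, so the handler can hand the final token list back to the `history_tokens` State. A simplified sketch of a consumer of that contract (not the exact committed code; error handling omitted):

```python
def generate_response(chatbot, history, history_tokens):
    """Stream an assistant reply; when the stream ends, persist the text into
    `history` and the accumulated token ids into the `history_tokens` State."""
    query = history[-1]["content"]
    response, all_tokens = "", history_tokens

    # the bot yields (partial_text, all_tokens_so_far) pairs
    for response, all_tokens in bot.generate_response(history[-1], history_tokens, stream=True):
        chatbot[-1] = (query, response)
        yield response, chatbot, history, all_tokens

    history.append({"role": "assistant", "content": response})
    yield response, chatbot, history, all_tokens
```

Keeping every yield the same length matters because Gradio maps each yielded tuple positionally onto `outputs=[generated_text_1, chatbot, history, history_tokens]`.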
models/cpp_qwen2.py CHANGED
@@ -36,14 +36,14 @@ import config
 
 class Qwen2Simulator(Simulator):
 
-    def __init__(self, from_local=False):
-        if from_local:
-            self.hf_tokenizer = AutoTokenizer.from_pretrained(
-                "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
-            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
-                model_path="/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf",
-                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                 n_ctx=config.MAX_SEQUENCE_LENGTH,  #
                 # n_threads=None,  # by default, n_threads is set from the CPU count
                 use_mlock=True,
@@ -54,7 +54,6 @@ class Qwen2Simulator(Simulator):
             self.llm = llama_cpp.Llama.from_pretrained(
                 repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                 filename="*fp16.gguf",
-                tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer(self.hf_tokenizer),
                 n_ctx=config.MAX_SEQUENCE_LENGTH,
                 use_mlock=True,
                 verbose=False,
@@ -77,68 +76,58 @@ class Qwen2Simulator(Simulator):
             ],
         )
 
-    def generate_query(self, messages, stream=True):
         """
-        :param messages:
-        :return:
         """
-        assert messages[-1]["role"] != "user"
-        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
-        inputs = self.hf_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=False,
         )
-        inputs = inputs + "<|im_start|>user\n"
         if stream:
-            return self._stream_generate(inputs)
         else:
-            return self._generate(inputs)
-
-    def generate_response(self, messages, stream=True):
-        assert messages[-1]["role"] == "user"
-        logger.info(f"generating {json.dumps(messages, ensure_ascii=False)}")
-        inputs = self.hf_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
         )
         if stream:
-            return self._stream_generate(inputs)
         else:
-            return self._generate(inputs)
-
-    def _generate(self, inputs):
-        logger.info(f"generation_kwargs {self.generation_kwargs}")
-        output = self.llm(
-            inputs,
-            **self.generation_kwargs
-        )
-        output_text = output["choices"][0]["text"]
-        return output_text
 
-    def _stream_generate(self, inputs):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
 
         # self.llm.generate  .set_cache  .last_n_tokens_size  .reset  .ctx  ._ctx
-        output = self.llm(
-            inputs,
             stream=True,
             **self.generation_kwargs
         )
-
-
         generated_text = ""
         # TODO: check the finish_reason; if it is "length", shift the context and keep generating.
         for out in output:
             stream = copy.deepcopy(out)
-            if stream["choices"][0]["finish_reason"] is not None:
                 generated_text += stream["choices"][0]["text"]
-            if "all_text" in stream["choices"][0]:
-                yield stream["choices"][0]["all_text"]
             else:
-                logger.info("all_text not found")
-                yield generated_text
 
 
 bot = Qwen2Simulator()
 
 
 class Qwen2Simulator(Simulator):
 
+    def __init__(self):
+        self.hf_tokenizer = AutoTokenizer.from_pretrained(
+            "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct/")
 
+        local_path = "/workspace/xusong/huggingface/models/Qwen2-0.5B-Instruct-GGUF/qwen2-0_5b-instruct-fp16.gguf"
+        if os.path.exists(local_path):
+            self.llm = llama_cpp.Llama(  # n_ctx, n_threads
+                model_path=local_path,
                 n_ctx=config.MAX_SEQUENCE_LENGTH,  #
                 # n_threads=None,  # by default, n_threads is set from the CPU count
                 use_mlock=True,
 
             self.llm = llama_cpp.Llama.from_pretrained(
                 repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
                 filename="*fp16.gguf",
                 n_ctx=config.MAX_SEQUENCE_LENGTH,
                 use_mlock=True,
                 verbose=False,
 
             ],
         )
 
+    def tokenize(self, text):
+        return self.llm.tokenize(text.encode("utf-8"))
+
+    def generate_query(self, message, history_tokens, stream=True):
         """
         """
+        # {% for message in messages %}
+        # {% if loop.first and messages[0]['role'] != 'system' %}
+        # {{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
+        # {% endif %}
+        # {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}
+        # {% endfor %}
+        # {% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+
+        input_ids = history_tokens + self.tokenize(
+            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>user\n"
         )
         if stream:
+            return self._stream_generate(input_ids)
         else:
+            return self._generate(input_ids)
+
+    def generate_response(self, message, history_tokens, stream=True):
+        input_ids = history_tokens + self.tokenize(
+            f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n<|im_start|>assistant\n"
         )
         if stream:
+            return self._stream_generate(input_ids)
         else:
+            return self._generate(input_ids)
 
+    def _stream_generate(self, input_ids):
         logger.info(f"generation_kwargs {self.generation_kwargs}")
 
         # self.llm.generate  .set_cache  .last_n_tokens_size  .reset  .ctx  ._ctx
+        output = self.llm.create_completion(
+            input_ids,
             stream=True,
             **self.generation_kwargs
         )
         generated_text = ""
         # TODO: check the finish_reason; if it is "length", shift the context and keep generating.
+        # TODO: also return the token ids
         for out in output:
             stream = copy.deepcopy(out)
+            if stream["choices"][0]["finish_reason"] is None:
                 generated_text += stream["choices"][0]["text"]
+            if "completion_text" in stream["choices"][0]:
+                yield stream["choices"][0]["completion_text"], stream["choices"][0]["all_tokens"]
             else:
+                logger.info("completion_text not found")
+                yield generated_text, None
 
 
 bot = Qwen2Simulator()
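The simulator now builds ChatML prompts by hand at the token level: each new turn is tokenized and appended to `history_tokens`, and the resulting token ids are passed directly to `create_completion`, which lets llama.cpp reuse previously evaluated prefix state instead of re-processing the whole conversation on every turn. The `completion_text` / `all_tokens` fields read from the stream chunks do not appear in stock llama-cpp-python responses and presumably come from a patched build, hence the fallback branch. A self-contained sketch of the same idea against the stock llama-cpp-python API, with a hypothetical model path:

```python
import llama_cpp

# hypothetical local GGUF path; any Qwen2-Instruct GGUF would do
llm = llama_cpp.Llama(model_path="qwen2-0_5b-instruct-fp16.gguf", n_ctx=2048, verbose=False)

def chatml_turn_tokens(role: str, content: str, next_role: str) -> list:
    """Tokenize one finished ChatML turn plus the opening tag of the next turn.
    special=True lets <|im_start|>/<|im_end|> map to single special tokens."""
    text = f"<|im_start|>{role}\n{content}<|im_end|>\n<|im_start|>{next_role}\n"
    return llm.tokenize(text.encode("utf-8"), add_bos=False, special=True)

# seed the token cache with the system turn and open a *user* turn,
# so the model is asked to write the user's side of the dialogue
history_tokens = chatml_turn_tokens("system", "You are a helpful assistant.", "user")

generated = ""
for chunk in llm.create_completion(history_tokens, stream=True, max_tokens=64,
                                   temperature=0.7, stop=["<|im_end|>"]):
    generated += chunk["choices"][0]["text"]
print(generated)

# close the simulated user turn and open an assistant turn; the shared prefix can
# stay in the KV cache, so only the newly appended tokens need evaluation next call
# (re-tokenizing the generated text is an approximation; the commit instead returns
# the exact generated token ids as `all_tokens`)
history_tokens += llm.tokenize(generated.encode("utf-8"), add_bos=False, special=False)
history_tokens += llm.tokenize("<|im_end|>\n<|im_start|>assistant\n".encode("utf-8"),
                               add_bos=False, special=True)
```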