xu song committed on
Commit
e74047c
1 Parent(s): 10e2ac5
Files changed (3)
  1. app.py +14 -10
  2. models/cpp_qwen2.py +40 -15
  3. models/hf_qwen2.py +45 -30
app.py CHANGED
@@ -20,15 +20,10 @@ python moss_web_demo_gradio.py --model_name fnlp/moss-moon-003-sft --gpu 0,1,2,3
 -
 """
 
-from transformers.generation.utils import logger
 
 import gradio as gr
-import argparse
-import warnings
-import torch
-import os
-# from models.hf_qwen2 import bot
-from models.cpp_qwen2 import bot
+from models.hf_qwen2 import bot
+# from models.cpp_qwen2 import bot
 
 
 #
@@ -83,11 +78,18 @@ def generate_query(chatbot, history):
     if history and history[-1]["role"] == "user":  # time to generate a response
         gr.Warning('You should generate assistant-response.')
         return None, chatbot, history
-    query = bot.generate_query(history)
+    streamer = bot.generate_query(history)
     # chatbot.append((query, ""))
+
+    query = ""
+    for new_text in streamer:
+        print(new_text)
+        query += new_text
+        yield query, chatbot, history
+
     chatbot.append((query, None))
     history.append({"role": "user", "content": query})
-    return query, chatbot, history
+    yield query, chatbot, history
 
 
 def generate_response(query, chatbot, history):
@@ -200,6 +202,8 @@ with gr.Blocks() as demo:
     generate_query_btn.click(generate_query, [chatbot, history], outputs=[user_input, chatbot, history],
                              show_progress="full")
 
+    # generate_query_btn.
+
     gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
     gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature",
               info="Larger temperature increase the randomness"),
@@ -211,5 +215,5 @@ with gr.Blocks() as demo:
         label="Top-p (nucleus sampling)",
     ),
 
-demo.queue().launch(share=False)
+demo.queue().launch(share=False, server_name="0.0.0.0")
 # demo.queue().launch(share=True)
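
The app.py change turns generate_query from a plain handler into a generator: instead of returning the finished query once, it yields the partial string on every new token, and Gradio re-renders the bound outputs on each yield. A minimal standalone sketch of that pattern (fake_stream and on_click are hypothetical stand-ins, not code from this repo):

import time
import gradio as gr

def fake_stream():
    # stand-in for the bot.generate_query(history) streamer
    for token in ["Hello", ", ", "world", "!"]:
        time.sleep(0.2)
        yield token

def on_click():
    # yielding instead of returning makes Gradio update the
    # textbox with each partial string
    text = ""
    for new_text in fake_stream():
        text += new_text
        yield text

with gr.Blocks() as demo:
    box = gr.Textbox()
    gr.Button("Generate").click(on_click, inputs=None, outputs=box)

demo.queue().launch()  # generator handlers require the queue
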
models/cpp_qwen2.py CHANGED
@@ -16,6 +16,7 @@ python convert_hf_to_gguf.py /workspace/xusong/huggingface/models/Qwen1.5-0.5B-C
 """
 
 import json
+import copy
 from simulator import Simulator
 import llama_cpp
 # import llama_cpp.llama_tokenizer
@@ -45,9 +46,22 @@ class Qwen2Simulator(Simulator):
         logger.info(f"llm has been initialized: {self.llm}")
         # warmup
 
+
+        self.generation_kwargs = dict(
+            temperature=5,
+            # top_p=0.1,
+            top_k=40,
+            max_tokens=20,
+            repeat_penalty=1.1,
+            stop=[
+                "<|im_end|>",
+                "<|im_start|>",
+                "<|endoftext|>",
+            ],
+        )
     ### local
 
-    def generate_query(self, messages):
+    def generate_query(self, messages, stream=True):
         """
         :param messages:
         :return:
@@ -60,11 +74,13 @@ class Qwen2Simulator(Simulator):
             add_generation_prompt=False,
         )
         inputs = inputs + "<|im_start|>user\n"
-        return self._generate(inputs)
-        # for new_text in self._stream_generate(input_ids):
-        #     yield new_text
+        if stream:
+            return self._stream_generate(inputs)
+        else:
+            return self._generate(inputs)
+
 
-    def generate_response(self, messages):
+    def generate_response(self, messages, stream=True):
         assert messages[-1]["role"] == "user"
         logger.info(f"generating {json.dumps(messages)}")
         inputs = self.hf_tokenizer.apply_chat_template(
@@ -72,13 +88,14 @@ class Qwen2Simulator(Simulator):
             tokenize=False,
             add_generation_prompt=True
         )
-
-        return self._generate(inputs)
-        # for new_text in self._stream_generate(input_ids):
-        #     yield new_text
+        if stream:
+            return self._stream_generate(inputs)
+        else:
+            return self._generate(inputs)
 
     def _generate(self, inputs):
         """
+        TODO: chat with cache.
        qwen2-0.5b-chat has a bug: sometimes the generated user turn has no closing <|im_end|>, for example:
         <|im_start|>system
         you are a helpful assistant<|im_end|>
@@ -91,16 +108,24 @@ class Qwen2Simulator(Simulator):
         <|im_start|>assistant
         I am a 41-year-old man.<|im_end|>
         """
-        # stream=False
         output = self.llm(
             inputs,
-            max_tokens=20,
-            temperature=5,
-            stop=["<|im_end|>", "<|im_start|>"]
+            **self.generation_kwargs
         )
         output_text = output["choices"][0]["text"]
         return output_text
 
+    def _stream_generate(self, inputs):
+        output = self.llm(
+            inputs,
+            stream=True,
+            **self.generation_kwargs
+        )
+        generated_text = ""
+        for out in output:
+            stream = copy.deepcopy(out)
+            generated_text += stream["choices"][0]["text"]
+            yield generated_text
 
 bot = Qwen2Simulator()
 
@@ -117,5 +142,5 @@ if __name__ == "__main__":
         {"role": "user", "content": "hi, what your name"},
         {"role": "assistant", "content": "My name is Jordan"}
     ]
-    output = bot.generate_query(messages)
-    print(output)
+    print(list(bot.generate_query(messages, stream=True)))
+    print(bot.generate_query(messages, stream=False))
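
In cpp_qwen2.py the sampling parameters move into a shared self.generation_kwargs dict, and the new _stream_generate drives llama-cpp-python's streaming mode: calling the Llama object with stream=True returns an iterator of chunks, each carrying an incremental piece of text under chunk["choices"][0]["text"] (the copy.deepcopy presumably guards against the library reusing chunk objects). The shape it relies on, sketched standalone with a subset of the same kwargs (the GGUF path is a placeholder):

from llama_cpp import Llama

llm = Llama(model_path="qwen2-0.5b-instruct-q8_0.gguf")  # placeholder path

generation_kwargs = dict(
    max_tokens=20,
    stop=["<|im_end|>", "<|im_start|>", "<|endoftext|>"],
)
# stream=True yields chunks instead of one completed dict
for chunk in llm("<|im_start|>user\n", stream=True, **generation_kwargs):
    print(chunk["choices"][0]["text"], end="", flush=True)
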
models/hf_qwen2.py CHANGED
@@ -26,10 +26,10 @@ class Qwen2Simulator(Simulator):
             temperature=0.7,
             # repetition_penalty=
             max_length=500,
-            max_new_tokens=200
+            max_new_tokens=20
         )
 
-    def generate_query(self, messages):
+    def generate_query(self, messages, stream=True):
         """
         :param messages:
         :return:
@@ -42,11 +42,23 @@ class Qwen2Simulator(Simulator):
         )
         inputs = inputs + "<|im_start|>user\n"
         input_ids = self.tokenizer.encode(inputs, return_tensors="pt").to(self.model.device)
-        return self._generate(input_ids)
-        # for new_text in self._stream_generate(input_ids):
-        #     yield new_text
 
-    def generate_response(self, messages):
+        streamer = TextIteratorStreamer(tokenizer=self.tokenizer, skip_prompt=True, timeout=120.0,
+                                        skip_special_tokens=True)
+
+        stream_generation_kwargs = dict(
+            input_ids=input_ids,
+            streamer=streamer
+        ).update(self.generation_kwargs)
+        thread = Thread(target=self.model.generate, kwargs=stream_generation_kwargs)
+        thread.start()
+
+        for new_text in streamer:
+            print(new_text)
+            yield new_text
+        # return self._generate(input_ids)
+
+    def generate_response(self, messages, stream=True):
         assert messages[-1]["role"] == "user"
         input_ids = self.tokenizer.apply_chat_template(
             messages,
@@ -54,46 +66,49 @@ class Qwen2Simulator(Simulator):
             return_tensors="pt",
             add_generation_prompt=True
         ).to(self.model.device)
-        return self._generate(input_ids)
-        # for new_text in self._stream_generate(input_ids):
-        #     yield new_text
 
-    def _generate(self, input_ids):
-        input_ids_length = input_ids.shape[-1]
-        response = self.model.generate(input_ids=input_ids, **self.generation_kwargs)
-        return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
-
-    def _stream_generate(self, input_ids):
-        streamer = TextIteratorStreamer(tokenizer=self.tokenizer, skip_prompt=True, timeout=60.0,
-                                        skip_special_tokens=True)
+        streamer = TextIteratorStreamer(
+            tokenizer=self.tokenizer,
+            # skip_prompt=True,
+            # timeout=120.0,
+            # skip_special_tokens=True
+        )
 
-        stream_generation_kwargs = dict(
+        generation_kwargs = dict(
             input_ids=input_ids,
             streamer=streamer
         ).update(self.generation_kwargs)
-        thread = Thread(target=self.model.generate, kwargs=stream_generation_kwargs)
+        print(generation_kwargs)
+
+        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
         thread.start()
 
         for new_text in streamer:
+            print(new_text)
             yield new_text
 
+    def _generate(self, input_ids):
+        input_ids_length = input_ids.shape[-1]
+        response = self.model.generate(input_ids=input_ids, **self.generation_kwargs)
+        return self.tokenizer.decode(response[0][input_ids_length:], skip_special_tokens=True)
+
 
 bot = Qwen2Simulator(r"E:\data_model\Qwen2-0.5B-Instruct")
 # bot = Qwen2Simulator("Qwen/Qwen2-0.5B-Instruct")
 
 
 if __name__ == "__main__":
-    # messages = [
-    #     {"role": "system", "content": "you are a helpful assistant"},
-    #     {"role": "user", "content": "hi, what your name"}
-    # ]
-    # output = bot.generate_response(messages)
-    # print(output)
-
     messages = [
         {"role": "system", "content": "you are a helpful assistant"},
-        {"role": "user", "content": "hi, what your name"},
-        {"role": "assistant", "content": "My name is Jordan"}
+        {"role": "user", "content": "hi, what your name"}
     ]
-    output = bot.generate_query(messages)
-    print(output)
+    streamer = bot.generate_response(messages)
+    # print(output)
+
+    # messages = [
+    #     {"role": "system", "content": "you are a helpful assistant"},
+    #     {"role": "user", "content": "hi, what your name"},
+    #     {"role": "assistant", "content": "My name is Jordan"}
+    # ]
+    # streamer = bot.generate_query(messages)
+    print(list(streamer))
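
One caveat in both the old and new versions of hf_qwen2.py: dict.update() mutates in place and returns None, so `dict(...).update(self.generation_kwargs)` leaves the kwargs variable bound to None, and Thread(target=..., kwargs=None) will then fail. A self-contained sketch of the same TextIteratorStreamer pattern with the kwargs built in one dict() call instead of via .update(); the checkpoint name is an assumption:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "Qwen/Qwen2-0.5B-Instruct"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "hi, what your name"}],
    return_tensors="pt",
    add_generation_prompt=True,
)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=20,  # matches the value this commit sets
)
# generate() blocks, so it runs in a worker thread while the
# main thread consumes decoded text from the streamer
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()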