yangapku committed on
Commit 583aaa0
1 Parent(s): 7bd1639

fix kwargs in generate method and update readme

Files changed (2)
  1. README.md +30 -9
  2. modeling_qwen.py +10 -6
README.md CHANGED
@@ -42,9 +42,21 @@ The features of Qwen-7B include:
 
 For more details about the open-source model of Qwen-7B, please refer to the [Github](https://github.com/QwenLM/Qwen-7B) code repository.
 
+## 要求(Requirements)
+
+* python 3.8及以上版本
+* pytorch 1.12及以上版本,推荐2.0及以上版本
+* 建议使用CUDA 11.4及以上(GPU用户、flash-attention用户等需考虑此选项)
+
+
+
+* python 3.8 and above
+* pytorch 1.12 and above, 2.0 and above are recommended
+* CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.)
+
 ## 依赖项 (Dependency)
 
-运行Qwen-7B,请确保机器环境torch版本不低于1.12,再执行以下pip命令安装依赖库
+运行Qwen-7B,请确保满足上述要求,再执行以下pip命令安装依赖库
 
 To run Qwen-7B, please make sure that pytorch version is not lower than 1.12, and then execute the following pip commands to install the dependent libraries.
 
@@ -75,18 +87,18 @@ from transformers.generation import GenerationConfig
 
 # Note: The default behavior now has injection attack prevention off.
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
-# We recommend checking the support of BF16 first. Run the command below:
-# import torch
-# torch.cuda.is_bf16_supported()
+
 # use bf16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, bf16=True).eval()
 # use fp16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, fp16=True).eval()
 # use cpu only
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True).eval()
-# use fp32
+# use auto mode, automatically select precision based on the device.
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True).eval()
-model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True) # 可指定不同的生成长度、top_p等相关超参
+
+# Specify hyperparameters for generation
+model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
 
 inputs = tokenizer('蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是', return_tensors='pt')
 inputs = inputs.to('cuda:0')
@@ -309,9 +321,17 @@ We introduce NTK-aware interpolation, LogN attention scaling, Window attention,
 
 ## 量化(Quantization)
 
-如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。
+如希望使用更低精度的量化模型,如4比特和8比特的模型,我们提供了简单的示例来说明如何快速使用量化模型。在开始前,确保你已经安装了`bitsandbytes`。请注意:`bitsandbytes`的安装要求是:
 
-We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have implemented `bitsandbytes`.
+We provide examples to show how to load models in `NF4` and `Int8`. For starters, make sure you have implemented `bitsandbytes`. Note that the requirements for `bitsandbytes` is:
+
+```
+**Requirements** Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0.
+```
+
+Windows用户需安装特定版本的`bitsandbytes`,可选项包括[bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels)。
+
+Windows users should find another option, which might be [bitsandbytes-windows-webui](https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels).
 
 ```bash
 pip install bitsandbytes
@@ -369,4 +389,5 @@ Our code and checkpoints are open to research purpose, and they are allowed for
 
 如果你想给我们的研发团队和产品团队留言,请通过邮件([email protected])联系我们。
 
-If you are interested to leave a message to either our research team or product team, feel free to send an email to [email protected].
+If you are interested to leave a message to either our research team or product team, feel free to send an email to [email protected].
+
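The quantization hunk above only adds the `bitsandbytes` prerequisites; the README's full `NF4`/`Int8` loading snippet sits outside this diff. As a minimal sketch of that workflow, assuming the standard `transformers` `BitsAndBytesConfig` interface (the model id and compute dtype below are illustrative, not taken from this commit), loading in 4-bit NF4 looks roughly like this:

```python
# Hedged sketch: load Qwen-7B-Chat with bitsandbytes quantization.
# Not the README's verbatim example; it follows the generic transformers
# BitsAndBytesConfig API and may differ from the repository's snippet.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

# 4-bit NF4 quantization; switch to BitsAndBytesConfig(load_in_8bit=True) for Int8.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumed compute dtype
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    device_map="auto",
    quantization_config=quantization_config,
    trust_remote_code=True,
).eval()
```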
modeling_qwen.py CHANGED
@@ -958,12 +958,14 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
         append_history: bool = True,
-        stream: Optional[bool] = False
+        stream: Optional[bool] = False,
+        stop_words_ids: Optional[List[List[int]]] = None,
+        **kwargs,
     ) -> Tuple[str, HistoryType]:
-
-
         if history is None:
             history = []
+        if stop_words_ids is None:
+            stop_words_ids = []
 
         raw_text, context_tokens = make_context(
             tokenizer,
@@ -974,9 +976,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             chat_format=self.generation_config.chat_format,
         )
 
-        stop_words_ids = get_stop_words_ids(
+        stop_words_ids.extend(get_stop_words_ids(
             self.generation_config.chat_format, tokenizer
-        )
+        ))
         input_ids = torch.tensor([context_tokens]).to(self.device)
         if stream:
             assert self.generation_config.chat_format == 'chatml'
@@ -986,7 +988,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             stream_config = StreamGenerationConfig(**self.generation_config.to_dict(), do_stream=True)
             def stream_generator():
                 outputs = []
-                for token in self.generate(input_ids, return_dict_in_generate=False, generation_config=stream_config):
+                for token in self.generate(
+                        input_ids, return_dict_in_generate=False, generation_config=stream_config, **kwargs):
                     outputs.append(token.item())
                     if outputs[-1] in (tokenizer.im_end_id, tokenizer.im_start_id):
                         break
@@ -998,6 +1001,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
                 input_ids,
                 stop_words_ids = stop_words_ids,
                 return_dict_in_generate = False,
+                **kwargs,
            )
 
         response = decode_tokens(
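Taken together, the modeling_qwen.py changes mean that callers of `chat()` can now supply extra `stop_words_ids` and arbitrary generation keyword arguments, and both are forwarded to `self.generate()` in the streaming and non-streaming paths. A minimal usage sketch follows; the prompt and hyperparameter values are illustrative assumptions, not defaults from the repository:

```python
# Hedged sketch of calling the patched chat() after this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat", device_map="auto", trust_remote_code=True
).eval()

# stop_words_ids is optional; chat() appends the chat-format stop words to it
# via stop_words_ids.extend(get_stop_words_ids(...)). top_p and max_new_tokens
# travel through **kwargs into self.generate().
response, history = model.chat(
    tokenizer,
    "你好",
    history=None,
    stop_words_ids=[[tokenizer.im_end_id]],  # extra stop words, illustrative only
    top_p=0.8,                               # forwarded via **kwargs
    max_new_tokens=256,                      # forwarded via **kwargs
)
print(response)
```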