jeffreymeetkai committed on
Commit
3db8af6
1 Parent(s): b63d65d

fix tokenization

Browse files
tokenization_functionary.py CHANGED
@@ -38,10 +38,8 @@ class Tool(BaseModel):
38
 
39
  def convert_data_type(param_type: str) -> str:
40
  """convert data_type to typescript data type
41
-
42
  Args:
43
  param_type (str): param_type
44
-
45
  Returns:
46
  str: param type in typescript
47
  """
@@ -52,10 +50,8 @@ def convert_data_type(param_type: str) -> str:
52
 
53
  def get_param_type(param: Dict) -> str:
54
  """get param_type of parameter
55
-
56
  Args:
57
  param (Dict): param dict in properties
58
-
59
  Returns:
60
  str: _description_
61
  """
@@ -80,10 +76,8 @@ def get_param_type(param: Dict) -> str:
80
 
81
  def get_format_param(param: Dict) -> Optional[str]:
82
  """Get "format" from param. There are cases where format is not directly in param but in oneOf
83
-
84
  Args:
85
  param (Dict): _description_
86
-
87
  Returns:
88
  Optional[str]: _description_
89
  """
@@ -101,10 +95,8 @@ def get_format_param(param: Dict) -> Optional[str]:
101
 
102
  def get_param_info(param: Dict) -> Optional[str]:
103
  """get additional information about parameter such as: format, default value, min, max, ...
104
-
105
  Args:
106
  param (Dict): _description_
107
-
108
  Returns:
109
  Optional[str]: _description_
110
  """
@@ -150,7 +142,6 @@ def append_new_param_info(
150
  depth: int,
151
  ):
152
  """Append a new parameter with comment to the info_list
153
-
154
  Args:
155
  info_lines (List[str]): current info_list
156
  param_declaration (str): param: type
@@ -176,11 +167,9 @@ def append_new_param_info(
176
 
177
  def get_examples_info(param_name: str, examples: List) -> List:
178
  """get information about examples provided
179
-
180
  Args:
181
  param_name (str): _description_
182
  examples (List): _description_
183
-
184
  Returns:
185
  List: _description_
186
  """
@@ -197,10 +186,8 @@ def get_examples_info(param_name: str, examples: List) -> List:
197
 
198
  def get_enum_option_str(enum_options: List) -> str:
199
  """get enum option separated by: "|"
200
-
201
  Args:
202
  enum_options (List): list of options
203
-
204
  Returns:
205
  _type_: concatenation of options separated by "|"
206
  """
@@ -212,12 +199,10 @@ def get_array_typescript(
212
  param_name: Optional[str], param_dic: dict, depth: int = 0
213
  ) -> str:
214
  """recursive implementation for generating type script of array
215
-
216
  Args:
217
  param_name (Optional[str]): name of param, optional
218
  param_dic (dict): param_dic
219
  depth (int, optional): nested level. Defaults to 0.
220
-
221
  Returns:
222
  _type_: typescript of array
223
  """
@@ -270,12 +255,10 @@ def get_array_typescript(
270
  def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
271
  """Recursion, returning the information about parameters including data type, description and other information
272
  These kinds of information will be put into the prompt
273
-
274
  Args:
275
  properties (_type_): properties in parameters
276
  required_params (_type_): List of required parameters
277
  depth (int, optional): the depth of params (nested level). Defaults to 0.
278
-
279
  Returns:
280
  _type_: list of lines containing information about all parameters
281
  """
@@ -461,20 +444,41 @@ class FunctionaryTokenizer(PreTrainedTokenizerFast):
461
  "point any code depending on them will stop working. We recommend setting a valid chat template before "
462
  "then to ensure that this model continues working without issues."
463
  )
464
-
 
 
 
 
 
 
 
 
 
 
 
465
  # Prepare tools/functions into schema
466
  functions_pydantic_to_render = []
467
  has_code_interpreter = False
468
- for i in range(len(tools)):
469
- tool_pydantic = Tool.model_validate(tools[i])
470
- if tool_pydantic.type == "function":
471
- functions_pydantic_to_render.append(tool_pydantic.function)
472
- else:
473
- has_code_interpreter = True
474
- conversation.insert(0, {"role": "system", "content": generate_schema_from_functions(functions_pydantic_to_render)})
475
- # Insert system prompt
476
- system_prompt_to_use = SYSTEM_PROMPT if not has_code_interpreter else CODE_INTERPRETER_SYSTEM_PROMPT
477
- conversation.insert(1, {"role": "system", "content": system_prompt_to_use})
 
 
 
 
 
 
 
 
 
 
478
 
479
  # Compilation function uses a cache to avoid recompiling the same template
480
  compiled_template = self._compile_jinja_template(chat_template)
 
38
 
39
  def convert_data_type(param_type: str) -> str:
40
  """convert data_type to typescript data type
 
41
  Args:
42
  param_type (str): param_type
 
43
  Returns:
44
  str: param type in typescript
45
  """
 
50
 
51
  def get_param_type(param: Dict) -> str:
52
  """get param_type of parameter
 
53
  Args:
54
  param (Dict): param dict in properties
 
55
  Returns:
56
  str: _description_
57
  """
 
76
 
77
  def get_format_param(param: Dict) -> Optional[str]:
78
  """Get "format" from param. There are cases where format is not directly in param but in oneOf
 
79
  Args:
80
  param (Dict): _description_
 
81
  Returns:
82
  Optional[str]: _description_
83
  """
 
95
 
96
  def get_param_info(param: Dict) -> Optional[str]:
97
  """get additional information about parameter such as: format, default value, min, max, ...
 
98
  Args:
99
  param (Dict): _description_
 
100
  Returns:
101
  Optional[str]: _description_
102
  """
 
142
  depth: int,
143
  ):
144
  """Append a new parameter with comment to the info_list
 
145
  Args:
146
  info_lines (List[str]): current info_list
147
  param_declaration (str): param: type
 
167
 
168
  def get_examples_info(param_name: str, examples: List) -> List:
169
  """get information about examples provided
 
170
  Args:
171
  param_name (str): _description_
172
  examples (List): _description_
 
173
  Returns:
174
  List: _description_
175
  """
 
186
 
187
  def get_enum_option_str(enum_options: List) -> str:
188
  """get enum option separated by: "|"
 
189
  Args:
190
  enum_options (List): list of options
 
191
  Returns:
192
  _type_: concatenation of options separated by "|"
193
  """
 
199
  param_name: Optional[str], param_dic: dict, depth: int = 0
200
  ) -> str:
201
  """recursive implementation for generating type script of array
 
202
  Args:
203
  param_name (Optional[str]): name of param, optional
204
  param_dic (dict): param_dic
205
  depth (int, optional): nested level. Defaults to 0.
 
206
  Returns:
207
  _type_: typescript of array
208
  """
 
255
  def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
256
  """Recursion, returning the information about parameters including data type, description and other information
257
  These kinds of information will be put into the prompt
 
258
  Args:
259
  properties (_type_): properties in parameters
260
  required_params (_type_): List of required parameters
261
  depth (int, optional): the depth of params (nested level). Defaults to 0.
 
262
  Returns:
263
  _type_: list of lines containing information about all parameters
264
  """
 
444
  "point any code depending on them will stop working. We recommend setting a valid chat template before "
445
  "then to ensure that this model continues working without issues."
446
  )
447
+
448
+ PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
449
+ SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
450
+ Only execute function(s) when absolutely necessary.
451
+ Ask for the required input to:recipient==all
452
+ Use JSON for function arguments.
453
+ Respond in this format:
454
+ >>>${recipient}
455
+ ${content}
456
+ Available functions:
457
+ """
458
+
459
  # Prepare tools/functions into schema
460
  functions_pydantic_to_render = []
461
  has_code_interpreter = False
462
+ if tools is not None:
463
+ for item in tools:
464
+ if (
465
+ "function" in item and item["function"] is not None
466
+ ): # new data format: tools: [{"type": xx, "function": xxx}]
467
+ functions_pydantic_to_render.append(item["function"])
468
+ elif "type" in item and item["type"] == "code_interpreter":
469
+ has_code_interpreter = True
470
+ else:
471
+ functions_pydantic_to_render.append(item) # old format
472
+
473
+ conversation.insert(
474
+ 0,
475
+ {
476
+ "role": "system",
477
+ "content": SYSTEM_CONTENT + generate_schema_from_functions(functions_pydantic_to_render),
478
+ },
479
+ )
480
+ if has_code_interpreter:
481
+ conversation.insert(1, {"role": "system", "content": PYTHON_RUN_SYS_MSG})
482
 
483
  # Compilation function uses a cache to avoid recompiling the same template
484
  compiled_template = self._compile_jinja_template(chat_template)
tokenizer_config.json CHANGED
@@ -2050,7 +2050,7 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
- "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>{role}<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "legacy": true,
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "legacy": true,