jeffreymeetkai committed on
Commit
3db8af6
1 Parent(s): b63d65d

fix tokenization

Browse files
tokenization_functionary.py CHANGED
@@ -38,10 +38,8 @@ class Tool(BaseModel):
38
 
39
  def convert_data_type(param_type: str) -> str:
40
  """convert data_type to typescript data type
41
-
42
  Args:
43
  param_type (str): param_type
44
-
45
  Returns:
46
  str: param type in typescript
47
  """
@@ -52,10 +50,8 @@ def convert_data_type(param_type: str) -> str:
52
 
53
  def get_param_type(param: Dict) -> str:
54
  """get param_type of parameter
55
-
56
  Args:
57
  param (Dict): param dict in properties
58
-
59
  Returns:
60
  str: _description_
61
  """
@@ -80,10 +76,8 @@ def get_param_type(param: Dict) -> str:
80
 
81
  def get_format_param(param: Dict) -> Optional[str]:
82
  """Get "format" from param. There are cases where format is not directly in param but in oneOf
83
-
84
  Args:
85
  param (Dict): _description_
86
-
87
  Returns:
88
  Optional[str]: _description_
89
  """
@@ -101,10 +95,8 @@ def get_format_param(param: Dict) -> Optional[str]:
101
 
102
  def get_param_info(param: Dict) -> Optional[str]:
103
  """get additional information about parameter such as: format, default value, min, max, ...
104
-
105
  Args:
106
  param (Dict): _description_
107
-
108
  Returns:
109
  Optional[str]: _description_
110
  """
@@ -150,7 +142,6 @@ def append_new_param_info(
150
  depth: int,
151
  ):
152
  """Append a new parameter with comment to the info_list
153
-
154
  Args:
155
  info_lines (List[str]): current info_list
156
  param_declaration (str): param: type
@@ -176,11 +167,9 @@ def append_new_param_info(
176
 
177
  def get_examples_info(param_name: str, examples: List) -> List:
178
  """get information about examples provided
179
-
180
  Args:
181
  param_name (str): _description_
182
  examples (List): _description_
183
-
184
  Returns:
185
  List: _description_
186
  """
@@ -197,10 +186,8 @@ def get_examples_info(param_name: str, examples: List) -> List:
197
 
198
  def get_enum_option_str(enum_options: List) -> str:
199
  """get enum option separated by: "|"
200
-
201
  Args:
202
  enum_options (List): list of options
203
-
204
  Returns:
205
  _type_: concatenation of options separated by "|"
206
  """
@@ -212,12 +199,10 @@ def get_array_typescript(
212
  param_name: Optional[str], param_dic: dict, depth: int = 0
213
  ) -> str:
214
  """recursive implementation for generating type script of array
215
-
216
  Args:
217
  param_name (Optional[str]): name of param, optional
218
  param_dic (dict): param_dic
219
  depth (int, optional): nested level. Defaults to 0.
220
-
221
  Returns:
222
  _type_: typescript of array
223
  """
@@ -270,12 +255,10 @@ def get_array_typescript(
270
  def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
271
  """Recursion, returning the information about parameters including data type, description and other information
272
  These kinds of information will be put into the prompt
273
-
274
  Args:
275
  properties (_type_): properties in parameters
276
  required_params (_type_): List of required parameters
277
  depth (int, optional): the depth of params (nested level). Defaults to 0.
278
-
279
  Returns:
280
  _type_: list of lines containing information about all parameters
281
  """
@@ -461,20 +444,41 @@ class FunctionaryTokenizer(PreTrainedTokenizerFast):
461
  "point any code depending on them will stop working. We recommend setting a valid chat template before "
462
  "then to ensure that this model continues working without issues."
463
  )
464
-
 
 
 
 
 
 
 
 
 
 
 
465
  # Prepare tools/functions into schema
466
  functions_pydantic_to_render = []
467
  has_code_interpreter = False
468
- for i in range(len(tools)):
469
- tool_pydantic = Tool.model_validate(tools[i])
470
- if tool_pydantic.type == "function":
471
- functions_pydantic_to_render.append(tool_pydantic.function)
472
- else:
473
- has_code_interpreter = True
474
- conversation.insert(0, {"role": "system", "content": generate_schema_from_functions(functions_pydantic_to_render)})
475
- # Insert system prompt
476
- system_prompt_to_use = SYSTEM_PROMPT if not has_code_interpreter else CODE_INTERPRETER_SYSTEM_PROMPT
477
- conversation.insert(1, {"role": "system", "content": system_prompt_to_use})
 
 
 
 
 
 
 
 
 
 
478
 
479
  # Compilation function uses a cache to avoid recompiling the same template
480
  compiled_template = self._compile_jinja_template(chat_template)
 
38
 
39
  def convert_data_type(param_type: str) -> str:
40
  """convert data_type to typescript data type
 
41
  Args:
42
  param_type (str): param_type
 
43
  Returns:
44
  str: param type in typescript
45
  """
 
50
 
51
  def get_param_type(param: Dict) -> str:
52
  """get param_type of parameter
 
53
  Args:
54
  param (Dict): param dict in properties
 
55
  Returns:
56
  str: _description_
57
  """
 
76
 
77
  def get_format_param(param: Dict) -> Optional[str]:
78
  """Get "format" from param. There are cases where format is not directly in param but in oneOf
 
79
  Args:
80
  param (Dict): _description_
 
81
  Returns:
82
  Optional[str]: _description_
83
  """
 
95
 
96
  def get_param_info(param: Dict) -> Optional[str]:
97
  """get additional information about parameter such as: format, default value, min, max, ...
 
98
  Args:
99
  param (Dict): _description_
 
100
  Returns:
101
  Optional[str]: _description_
102
  """
 
142
  depth: int,
143
  ):
144
  """Append a new parameter with comment to the info_list
 
145
  Args:
146
  info_lines (List[str]): current info_list
147
  param_declaration (str): param: type
 
167
 
168
  def get_examples_info(param_name: str, examples: List) -> List:
169
  """get information about examples provided
 
170
  Args:
171
  param_name (str): _description_
172
  examples (List): _description_
 
173
  Returns:
174
  List: _description_
175
  """
 
186
 
187
  def get_enum_option_str(enum_options: List) -> str:
188
  """get enum option separated by: "|"
 
189
  Args:
190
  enum_options (List): list of options
 
191
  Returns:
192
  _type_: concatenation of options separated by "|"
193
  """
 
199
  param_name: Optional[str], param_dic: dict, depth: int = 0
200
  ) -> str:
201
  """recursive implementation for generating type script of array
 
202
  Args:
203
  param_name (Optional[str]): name of param, optional
204
  param_dic (dict): param_dic
205
  depth (int, optional): nested level. Defaults to 0.
 
206
  Returns:
207
  _type_: typescript of array
208
  """
 
255
  def get_parameter_typescript(properties, required_params, depth=0) -> List[str]:
256
  """Recursion, returning the information about parameters including data type, description and other information
257
  These kinds of information will be put into the prompt
 
258
  Args:
259
  properties (_type_): properties in parameters
260
  required_params (_type_): List of required parameters
261
  depth (int, optional): the depth of params (nested level). Defaults to 0.
 
262
  Returns:
263
  _type_: list of lines containing information about all parameters
264
  """
 
444
  "point any code depending on them will stop working. We recommend setting a valid chat template before "
445
  "then to ensure that this model continues working without issues."
446
  )
447
+
448
+ PYTHON_RUN_SYS_MSG = "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data' can be used to save and persist user files."
449
+ SYSTEM_CONTENT = """You are capable of executing available function(s) if required.
450
+ Only execute function(s) when absolutely necessary.
451
+ Ask for the required input to:recipient==all
452
+ Use JSON for function arguments.
453
+ Respond in this format:
454
+ >>>${recipient}
455
+ ${content}
456
+ Available functions:
457
+ """
458
+
459
  # Prepare tools/functions into schema
460
  functions_pydantic_to_render = []
461
  has_code_interpreter = False
462
+ if tools is not None:
463
+ for item in tools:
464
+ if (
465
+ "function" in item and item["function"] is not None
466
+ ): # new data format: tools: [{"type": xx, "function": xxx}]
467
+ functions_pydantic_to_render.append(item["function"])
468
+ elif "type" in item and item["type"] == "code_interpreter":
469
+ has_code_interpreter = True
470
+ else:
471
+ functions_pydantic_to_render.append(item) # old format
472
+
473
+ conversation.insert(
474
+ 0,
475
+ {
476
+ "role": "system",
477
+ "content": SYSTEM_CONTENT + generate_schema_from_functions(functions_pydantic_to_render),
478
+ },
479
+ )
480
+ if has_code_interpreter:
481
+ conversation.insert(1, {"role": "system", "content": PYTHON_RUN_SYS_MSG})
482
 
483
  # Compilation function uses a cache to avoid recompiling the same template
484
  compiled_template = self._compile_jinja_template(chat_template)
tokenizer_config.json CHANGED
@@ -2050,7 +2050,7 @@
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
- "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>{role}<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "legacy": true,
 
2050
  }
2051
  },
2052
  "bos_token": "<|begin_of_text|>",
2053
+ "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' or message['role'] == 'system' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% elif message['role'] == 'tool' %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] + '<|eot_id|>' }}{% else %}\n{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'}}{% if message['content'] is not none %}\n{{ '>>>all\n' + message['content'] }}{% endif %}\n{% if 'tool_calls' in message and message['tool_calls'] is not none %}\n{% for tool_call in message['tool_calls'] %}\n{{ '>>>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments'] }}{% endfor %}\n{% endif %}\n{{ '<|eot_id|>' }}{% endif %}\n{% endfor %}\n{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
2054
  "clean_up_tokenization_spaces": true,
2055
  "eos_token": "<|eot_id|>",
2056
  "legacy": true,