xu-song committed
Commit d2417c7
1 Parent(s): ae282a4

add more tokenizers

vocab/__init__.py CHANGED
@@ -46,7 +46,7 @@ tokenizer.special_tokens_map
 tokenizer.dependency [sentencepiece, tiktoken, icetk]
 """
 
-Animal = Enum('Animal', 'ANT BEE CAT DOG')
+# Animal = Enum('Animal', 'ANT BEE CAT DOG')
 
 uniq_tokenizers = [
     ""
@@ -95,6 +95,9 @@ all_tokenizers = [
     "baichuan",
     "baichuan2",
     "internlm_chat_7b",
+    "internlm2_chat_7b",
+    "internlm2_math_7b",
+    "internlm_xcomposer_7b",
     "falcon_7b",
     "falcon_180b",
     # "goat",
@@ -111,7 +114,8 @@ all_tokenizers = [
     # 未分类 (uncategorized)
     "skywork_13b_base",
     "skywork_13b_math",
-    "mistral",
+    "mistral_7b",
+    "mixtral_8_7b",
     "t5_small",
     "t5_base",
     "t5_large",
@@ -119,6 +123,13 @@ all_tokenizers = [
     "fastchat_t5_3b",
     "pko_t5_large",
     "wizardcoder_15b_v1",
+    "yi_6b",
+    "yi_34b",
+    "yi_vl34b",
+    "orion_14b_chat",
+    "phi_1",
+    "phi_2",
+    "solar_10_7b",
     "wizardcoder_python_7b_v1",
     "wizardlm_7b_v1",
     "wizardmath_70b_v1",
@@ -128,7 +139,6 @@ all_tokenizers = [
     "deepseek_llm_7b_base",
 
 
-
 ]
 
 all_tokenizers = sorted(all_tokenizers)
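
Every string in all_tokenizers names a package under vocab/ whose __init__.py exposes a module-level tokenizer, as the added files below illustrate. A minimal sketch of how a name from this registry could be resolved at runtime; the load_tokenizer helper is an illustration, not part of this commit:

import importlib

def load_tokenizer(name: str):
    # Hypothetical helper: resolve a registered name such as "mistral_7b"
    # to the module-level `tokenizer` that vocab/<name>/__init__.py defines.
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer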
vocab/internlm2_chat_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
vocab/internlm2_math_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-math-7b", trust_remote_code=True)
vocab/internlm_xcomposer_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-xcomposer-7b", trust_remote_code=True)
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/special_tokens_map.json RENAMED
File without changes
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.json RENAMED
File without changes
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer.model RENAMED
File without changes
vocab/{mistral → mistral_7b}/Mistral-7B-v0.1/tokenizer_config.json RENAMED
File without changes
vocab/{mistral → mistral_7b}/README.md RENAMED
File without changes
vocab/{mistral → mistral_7b}/__init__.py RENAMED
File without changes
vocab/mixtral_8_7b/__init__.py ADDED
@@ -0,0 +1,2 @@
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1", trust_remote_code=True)
vocab/orion_14b_chat/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("OrionStarAI/Orion-14B-Chat", trust_remote_code=True)
vocab/phi_1/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1", trust_remote_code=True)
vocab/phi_2/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
vocab/solar_10_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Upstage/SOLAR-10.7B-v1.0")
vocab/yi_34b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-34B", trust_remote_code=True)
vocab/yi_6b/__init__.py ADDED
@@ -0,0 +1,3 @@
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-6B", trust_remote_code=True)
vocab/yi_vl34b/__init__.py ADDED
@@ -0,0 +1,9 @@
+"""
+
+Yi-VL adopts the LLaVA architecture.
+"""
+
+
+
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("01-ai/Yi-VL-34B", trust_remote_code=True)
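
All of the new packages follow the same convention: importing the package loads a ready AutoTokenizer. A quick smoke test against one of the entries added here, assuming the Hugging Face Hub is reachable and the model IDs above resolve:

from transformers import AutoTokenizer

# Load one of the tokenizers added in this commit and inspect it.
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-chat-7b", trust_remote_code=True)
print(tokenizer.vocab_size)                       # base vocabulary size
print(tokenizer.tokenize("add more tokenizers"))  # sample segmentation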