update

Files changed:
- README.md  +4 -1
- vocab/moss/__init__.py  +4 -3
- vocab/qwen_7b_chat/__init__.py  +5 -4
- vocab/skywork_13b_base/__init__.py  +6 -0
- vocab/skywork_13b_math/__init__.py  +4 -0
README.md
CHANGED
@@ -15,7 +15,10 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
## ss


-##
+## TODO
+
+
+'MossTokenizer' object has no attribute 'encoder'


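The TODO added here records the error hit when reading the MOSS vocabulary: 'MossTokenizer' object has no attribute 'encoder'. MossTokenizer is loaded via trust_remote_code and apparently does not expose a GPT-2-style .encoder dict. A minimal sketch of a workaround, assuming the generic get_vocab() API is enough; this is not necessarily how the Space resolves the TODO:

from transformers import AutoTokenizer

# Sketch: read the vocabulary without assuming a GPT-2-style `.encoder` attribute.
tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True)

if hasattr(tokenizer, "encoder"):
    vocab = tokenizer.encoder          # GPT-2-style slow tokenizers
else:
    vocab = tokenizer.get_vocab()      # generic PreTrainedTokenizer API

print(len(vocab), tokenizer.vocab_size)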
vocab/moss/__init__.py
CHANGED
@@ -2,10 +2,11 @@
import os
from transformers import AutoTokenizer, BloomTokenizerFast

-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
+# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
+# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)

-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True)

# vocab_size = len(tokenizer.get_vocab())
# vocab_size = tokenizer.vocab_size
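This hunk switches the MOSS tokenizer from a vendored local directory (moss-moon-003-sft next to the module) to the Hub repo fnlp/moss-moon-003-sft. A small sketch of a loader that prefers the local copy when it exists and otherwise falls back to the Hub; the fallback logic is illustrative and not part of the Space:

import os
from transformers import AutoTokenizer

# Sketch of the two loading paths the commit toggles between.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")

if os.path.isdir(TOKENIZER_DIR):
    # use the copy checked into the repo
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
else:
    # download from the Hugging Face Hub
    tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True)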
vocab/qwen_7b_chat/__init__.py
CHANGED
@@ -9,12 +9,13 @@ https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md

import os
from transformers import AutoTokenizer
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Qwen-7B-Chat")

# 请注意:分词器默认行为已更改为默认关闭特殊token攻击防护。
-
-
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
+
+# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Qwen-7B-Chat")
+# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)

tokenizer.comments = "在gpt4词典基础上,删除了100个多数字token,增加10000中文词token;并优化了special_token的分词"

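For reference, the Chinese comment says the tokenizer's default behaviour has changed so that protection against special-token injection is now off by default (see the tokenization note linked above), and the comments string says the vocabulary builds on the GPT-4 vocabulary, removes 100 multi-digit number tokens, adds 10,000 Chinese word tokens, and improves how special tokens are split. A minimal usage sketch, assuming Hub access and that Qwen's remote code is trusted:

from transformers import AutoTokenizer

# Sketch only: load the chat tokenizer and inspect it.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

print(tokenizer.vocab_size)           # vocabulary size
print(tokenizer.encode("你好,世界"))  # token ids for a short Chinese string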
vocab/skywork_13b_base/__init__.py
ADDED
@@ -0,0 +1,6 @@
+
+from transformers import AutoTokenizer
+
+
+
+tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
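The new module does nothing beyond loading the Skywork-13B-base tokenizer from the Hub. A quick, illustrative way to poke at the loaded object (not part of the Space's module):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)

print(len(tokenizer.get_vocab()))         # total vocabulary size
print(tokenizer.tokenize("hello, 世界"))  # sample segmentation of a short mixed string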
vocab/skywork_13b_math/__init__.py
ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-Math", trust_remote_code=True)
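Both Skywork modules follow the same two-line pattern. If the point of having both is to compare the base and math variants, a hedged sketch of such a comparison (illustrative only; assumes both repos are reachable on the Hub):

from transformers import AutoTokenizer

base_tok = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
math_tok = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-Math", trust_remote_code=True)

print(len(base_tok.get_vocab()), len(math_tok.get_vocab()))
print(base_tok.get_vocab() == math_tok.get_vocab())  # True if the two share one vocabulary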