keithhon committed on
Commit
9ee87b9
1 Parent(s): ecc7fff

Upload dalle/models/tokenizer.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. dalle/models/tokenizer.py +26 -0
dalle/models/tokenizer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ------------------------------------------------------------------------------------
2
+ # Minimal DALL-E
3
+ # Copyright (c) 2021 KakaoBrain. All Rights Reserved.
4
+ # Licensed under the Apache License, Version 2.0 [see LICENSE for details]
5
+ # ------------------------------------------------------------------------------------
6
+
7
+ import os
8
+ from functools import partial
9
+ from tokenizers import CharBPETokenizer
10
+
11
+
12
def build_tokenizer(path: str,
                    context_length: int = 64,
                    *args,
                    **kwargs):
    """Load the character-level BPE tokenizer stored in directory ``path``.

    The vocabulary is read from ``bpe-16k-vocab.json`` and the merge rules
    from ``bpe-16k-merges.txt``, both looked up inside ``path``. After
    loading, a ``[PAD]`` special token is registered, padding to
    ``context_length`` (using the id of ``[PAD]``) is enabled, and
    truncation at ``context_length`` is enabled.

    Args:
        path: Directory containing the two BPE files named above.
        context_length: Length passed to both the padding and the
            truncation configuration.
        *args: Extra positional arguments forwarded to
            ``CharBPETokenizer.from_file``.
        **kwargs: Extra keyword arguments forwarded to
            ``CharBPETokenizer.from_file``; a key that matches one of the
            defaults set here replaces that default.

    Returns:
        The configured tokenizer instance.
    """
    # In a dict merge the later keys win, so caller-supplied keywords take
    # precedence over the defaults below.
    load_options = {
        'vocab_filename': os.path.join(path, 'bpe-16k-vocab.json'),
        'merges_filename': os.path.join(path, 'bpe-16k-merges.txt'),
        'unk_token': '[UNK]',
        **kwargs,
    }
    bpe = CharBPETokenizer.from_file(*args, **load_options)

    # The pad token must be registered before its id can be looked up.
    bpe.add_special_tokens(['[PAD]'])
    pad_token_id = bpe.token_to_id('[PAD]')

    bpe.enable_padding(length=context_length, pad_id=pad_token_id)
    bpe.enable_truncation(max_length=context_length)

    print(f'{path} successfully restored..')
    return bpe