import streamlit as st
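
# Streamlit app for exploring how Hugging Face tokenizers split text into
# sub-word tokens, and for converting token IDs back into text.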

@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(tokenizer_name):
    """Load and cache the Hugging Face tokenizer matching the display name."""
    from transformers import AutoTokenizer
    model_name_dict = {
        "BERT": "bert-base-uncased",
        "RoBERTa": "roberta-base",
        "ALBERT": "albert-base-v2",
        "GPT2": "gpt2",
        #"Llama": "meta-llama/Llama-2-7b-chat-hf",
        #"Gemma": "google/gemma-7b",
        }
    tokenizer = AutoTokenizer.from_pretrained(model_name_dict[tokenizer_name])
    return tokenizer
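
# Example: load_model("BERT") returns the bert-base-uncased AutoTokenizer; the
# result is cached by st.cache (older Streamlit API; newer releases use
# st.cache_resource) so reruns of the script do not reload it.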

def generate_markdown(text, color='black', font='Arial', size=20):
    """Wrap text in a centered, styled HTML paragraph for st.markdown."""
    return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{text}</p>"

def TokenizeText(sentence, tokenizer_name):
    """Tokenize `sentence` with the globally loaded `tokenizer` and display the
    token IDs, the decoded sub-words, and the token count."""
    if len(sentence) > 0:
        #if tokenizer_name.startswith('gpt2'):
        #    input_sent = tokenizer(sentence)['input_ids']
        #else:
        #    input_sent = tokenizer(sentence)['input_ids'][1:-1]
        input_sent = tokenizer(sentence)['input_ids']
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = [tokenizer.decode([token]) for token in input_sent]
        num_tokens = len(decoded_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col,word in zip(word_cols,decoded_sent):
            #with word_col:
                #st.write(word)
        #st.write('   '.join(encoded_sent))
        #st.write('   '.join(decoded_sent))
        st.markdown(generate_markdown('   '.join(encoded_sent),size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown('   '.join(decoded_sent),size=16), unsafe_allow_html=True)
        st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)

        return num_tokens
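
# Example (inside the running app, once `tokenizer` has been loaded in __main__):
#   TokenizeText('unbelievable', 'BERT')  # renders the IDs, sub-words, and token count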

def DeTokenizeText(input_str):
    """Convert a space-separated string of token IDs back into text and display it."""
    if len(input_str) > 0:
        # split() (rather than split(' ')) tolerates repeated spaces in the user input
        input_sent = [int(element) for element in input_str.strip().split()]
        encoded_sent = [str(token) for token in input_sent]
        decoded_sent = tokenizer.decode(input_sent)
        num_tokens = len(input_sent)

        #char_nums = [len(word)+2 for word in decoded_sent]
        #word_cols = st.columns(char_nums)
        #for word_col,word in zip(word_cols,decoded_sent):
            #with word_col:
                #st.write(word)
        #st.write('   '.join(encoded_sent))
        #st.write('   '.join(decoded_sent))
        st.markdown(generate_markdown(decoded_sent), unsafe_allow_html=True)
        return num_tokens

if __name__=='__main__':

    # Config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2

    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
                <style>
                tbody th {display:none}
                .blank {display:none}
                </style>
                """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.markdown(generate_markdown('WordPiece Explorer',size=32), unsafe_allow_html=True)
    st.markdown(generate_markdown('- quick and easy way to explore how tokenizers work -',size=24), unsafe_allow_html=True)

    # Select and load the tokenizer
    st.sidebar.write('1. Choose the tokenizer from below')
    tokenizer_name = st.sidebar.selectbox('',
                                            ("BERT","RoBERTa","ALBERT",
                                             "GPT2"))
    tokenizer = load_model(tokenizer_name)

    st.sidebar.write('2. Optional settings')
    comparison_mode = st.sidebar.checkbox('Compare two texts')
    detokenize = st.sidebar.checkbox('de-tokenize')
    st.sidebar.write('"Compare two texts" compares the number of tokens for two pieces of text, '
                        'and "de-tokenize" converts a list of token IDs back to strings.')
    st.sidebar.write('For "de-tokenize", type in integers separated by spaces.')
    if comparison_mode:
        sent_cols = st.columns(2)
        num_tokens = {}
        sents = {}
        for sent_id, sent_col in enumerate(sent_cols):
            with sent_col:
                if detokenize:
                    sentence = st.text_input(f'Tokenized IDs {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
                else:
                    sentence = st.text_input(f'Text {sent_id+1}')
                    num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence,tokenizer_name)
                sents[f'sent_{sent_id+1}'] = sentence

        if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
            st.markdown(generate_markdown('# Tokens&colon; ',size=16), unsafe_allow_html=True)
            if num_tokens['sent_1'] == num_tokens['sent_2']:
                st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
            else:
                st.markdown(generate_markdown('Not Matched... ',color='Salmon'), unsafe_allow_html=True)

    else:
        if detokenize:
            #if tokenizer_name.startswith('gpt2'):
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            #else:
            #    default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids'][1:-1]
            default_tokens = tokenizer('Tokenizers decompose bigger words into smaller tokens')['input_ids']
            sentence = st.text_input('Tokenized IDs', value=' '.join([str(token) for token in default_tokens]))
            num_tokens = DeTokenizeText(sentence)
        else:
            sentence = st.text_input('Text', value='Tokenizers decompose bigger words into smaller tokens')
            num_tokens = TokenizeText(sentence,tokenizer_name)
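
# To run locally (assuming this script is saved as app.py, with streamlit and
# transformers installed):
#   streamlit run app.py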