File size: 2,247 Bytes
5e5793b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import streamlit as st
import numpy as np
import torch
from transformers import AlbertTokenizer
import io
import time

@st.cache(show_spinner=True,allow_output_mutation=True)
def load_model(model_name):
    """Load the pretrained tokenizer for ``model_name``.

    The result is wrapped in ``st.cache`` so Streamlit can reuse it across
    script reruns instead of loading it again on every interaction.

    Args:
        model_name: Identifier passed to ``AlbertTokenizer.from_pretrained``.
            Only names starting with ``'albert'`` are supported.

    Returns:
        The ``AlbertTokenizer`` loaded for ``model_name``.

    Raises:
        ValueError: If ``model_name`` does not start with ``'albert'``.
    """
    if not model_name.startswith('albert'):
        # Fail loudly: without this guard the function would reach its
        # return statement with no tokenizer bound (UnboundLocalError).
        raise ValueError(
            f"Unsupported model name {model_name!r}: "
            "only 'albert*' models are supported."
        )
    return AlbertTokenizer.from_pretrained(model_name)


def _decode_tokens(tokenizer, sentence):
    """Return the decoded text of every token id in ``sentence``.

    The first and last ids produced by the tokenizer are dropped
    (presumably the special tokens wrapped around the input -- confirm
    against the tokenizer in use).
    """
    input_ids = tokenizer(sentence)['input_ids']
    return [tokenizer.decode([token_id]) for token_id in input_ids[1:-1]]


def main():
    """Render the tokenizer demo page.

    Shows two text inputs side by side, prints each sentence's tokens in
    width-proportional columns together with the token count, and reports
    whether both sentences have the same number of tokens.
    """
    # Config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2

    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
                <style>
                tbody th {display:none}
                .blank {display:none}
                </style>
                """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # No on_change callback is registered: the handler referenced before
    # (clear_df) does not exist in this file and raised NameError on start.
    # The selected value is not consumed anywhere yet, so it is not stored.
    st.sidebar.radio(
        label='1. Choose the input type',
        options=('Use one of the example sentences','Use your own initial sentence')
    )

    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')
    token_counts = []
    for sent_id, sent_col in enumerate(st.columns(2)):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            decoded_sent = _decode_tokens(tokenizer, sentence)
            token_counts.append(len(decoded_sent))

            # st.columns rejects an empty width list, which is what an
            # empty text box (the initial state) would produce.
            if decoded_sent:
                # Column widths are proportional to token length (+2 padding).
                char_nums = [len(word)+2 for word in decoded_sent]
                for word_col, word in zip(st.columns(char_nums), decoded_sent):
                    with word_col:
                        st.write(word)
            st.write(f'{len(decoded_sent)} tokens')

    # Compare the two sentences that were actually collected (ids 0 and 1).
    first_count, second_count = token_counts
    if first_count == second_count:
        st.write('Matched!')
    else:
        st.write('Not Matched...')


if __name__ == '__main__':
    main()