tokenizer-demo / app.py
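# Streamlit demo that tokenizes two sentences side by side with an ALBERT
# tokenizer and reports whether they contain the same number of tokens.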
import streamlit as st
from transformers import AlbertTokenizer
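# Cache the loaded tokenizer across Streamlit reruns; allow_output_mutation
# skips hashing the returned tokenizer object on each rerun.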
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer
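
# The sidebar radio below passes clear_df as its on_change callback, but the
# function is not defined in this file. A no-op placeholder keeps the app
# runnable; the original callback presumably reset cached state elsewhere.
def clear_df():
    pass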
if __name__ == '__main__':
    # Config
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
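    # Inline CSS to widen the main block container and trim its padding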
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)
    input_type = st.sidebar.radio(
        label='1. Choose the input type',
        on_change=clear_df,
        options=('Use one of the example sentences', 'Use your own initial sentence'),
    )
    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')
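    # Show the two sentences side by side; within each column, render one
    # sub-column per token so the segmentation is visible at a glance.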
    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            input_sent = tokenizer(sentence)['input_ids']
            # Strip the special tokens ([CLS] and [SEP]) before decoding
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)
            if decoded_sent:  # st.columns raises on an empty spec (e.g. no input yet)
                char_nums = [len(word)+2 for word in decoded_sent]
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            st.write(f"{num_tokens[f'sent_{sent_id}']} tokens")
    # enumerate starts at 0, so the two sentences are keyed 'sent_0' and 'sent_1'
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.write('Matched!')
    else:
        st.write('Not Matched...')