import pandas as pd
import streamlit as st
import numpy as np
import torch
from transformers import AlbertTokenizer
import io
import time
@st.cache(show_spinner=True, allow_output_mutation=True)
def load_model(model_name):
    """Load the tokenizer for ``model_name``, cached across Streamlit reruns.

    Args:
        model_name: Pretrained model identifier handed to
            ``AlbertTokenizer.from_pretrained``. Only identifiers that start
            with ``'albert'`` are supported.

    Returns:
        The tokenizer returned by ``AlbertTokenizer.from_pretrained``.

    Raises:
        ValueError: If ``model_name`` does not start with ``'albert'``.
    """
    # NOTE(review): recent Streamlit releases deprecate ``st.cache`` in favour
    # of ``st.cache_resource`` -- confirm the installed version before switching.
    if not model_name.startswith('albert'):
        # Without this guard the function fell through to ``return tokenizer``
        # with the name never assigned, surfacing as an UnboundLocalError.
        raise ValueError(
            f"Unsupported model {model_name!r}: only 'albert*' models are handled"
        )
    return AlbertTokenizer.from_pretrained(model_name)
if __name__ == '__main__':
    # Streamlit entry point: shows two text inputs side by side, displays the
    # tokens each sentence is split into, and reports whether both sentences
    # produce the same number of tokens.

    # Config
    # NOTE(review): these layout values are not interpolated into
    # ``define_margins`` as it stands (the template has no placeholders);
    # they are kept so the intended settings are not lost.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
"""
    hide_table_row_index = """
"""
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')
    sent_cols = st.columns(2)

    # Token count per sentence, keyed 'sent_1' / 'sent_2' to match both the
    # on-screen labels and the comparison below. (Keys used to be built from
    # the 0-based index, so the lookup of 'sent_2' always raised KeyError.)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols, start=1):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id}')
            input_sent = tokenizer(sentence)['input_ids']
            # Drop the first and last ids -- presumably the special tokens the
            # tokenizer wraps around the input -- and decode each id on its own.
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)

            # ``st.columns`` rejects an empty weight list, which is what an
            # empty text box (the state on first load) would produce.
            if decoded_sent:
                # Column widths roughly proportional to each token's length.
                char_nums = [len(word) + 2 for word in decoded_sent]
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            st.write(f'{len(decoded_sent)} tokens')

    if num_tokens['sent_1'] == num_tokens['sent_2']:
        st.subheader('Matched!')
    else:
        st.subheader('Not Matched...')