import pandas as pd
import streamlit as st
import numpy as np
import torch
from transformers import AlbertTokenizer
import io
import time

def load_model(model_name):
    # Only ALBERT checkpoints are handled here; returns the matching tokenizer.
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer
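
# Note (not part of the original code): on recent Streamlit versions the
# tokenizer load can be cached across reruns by decorating load_model with
# @st.cache_resource, so the ALBERT files are only fetched once per session.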

def clear_df():
    # Minimal stand-in for the on_change callback referenced by the sidebar
    # radio below (its original definition is not shown in this snippet):
    # clear any cached results when the input type changes.
    for key in list(st.session_state.keys()):
        del st.session_state[key]

if __name__ == '__main__':
    # Config: widen the main container and trim its padding.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    # Hide the index column that st.table would otherwise render.
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    input_type = st.sidebar.radio(
        label='1. Choose the input type',
        on_change=clear_df,
        options=('Use one of the example sentences', 'Use your own initial sentence')
    )

    # Title
    st.header("Tokenizer Demo")
    tokenizer = load_model('albert-xxlarge-v2')

    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            input_sent = tokenizer(sentence)['input_ids']
            # Drop the special [CLS]/[SEP] tokens and decode each id back to a piece.
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)
            if decoded_sent:
                # Lay the tokens out side by side, one column per token,
                # sized roughly by the token's character count.
                char_nums = [len(word) + 2 for word in decoded_sent]
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            st.write(f"{num_tokens[f'sent_{sent_id}']} tokens")

    # The dictionary keys are 'sent_0' and 'sent_1' (sent_id starts at 0).
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.write('Matched!')
    else:
        st.write('Not Matched...')
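
# To try the demo locally (assuming this file is saved as app.py and that
# streamlit, torch, and transformers are installed):
#
#   streamlit run app.py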