Spaces:
Running
Running
File size: 6,176 Bytes
5e5793b b5a5fbe 493d41a b5a5fbe 493d41a b5a5fbe 5e5793b 8a204f8 21c2f11 8a204f8 b5a5fbe 21c2f11 8a204f8 a999c8e 8a204f8 21c2f11 145f48c 21c2f11 145f48c 21c2f11 5e5793b ef0b5c6 6751661 8a204f8 dc80c0d b5a5fbe 493d41a 8a204f8 5e5793b ed9112c a999c8e ed9112c dc80c0d 8a204f8 21c2f11 9240bf4 21c2f11 9240bf4 21c2f11 9240bf4 5e5793b 8a204f8 ed9112c 8a204f8 228552f 8a204f8 21c2f11 b5a5fbe 6751661 21c2f11 ef0b5c6 21c2f11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import pandas as pd
import streamlit as st
import numpy as np
import torch
import io
import time
# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour of
# st.cache_resource -- confirm against the Streamlit version this app is deployed on.
@st.cache(show_spinner=True,allow_output_mutation=True)
def load_model(tokenizer_name):
    """Load the pretrained tokenizer that corresponds to a display name.

    Args:
        tokenizer_name: One of the keys of the checkpoint table below
            ("BERT", "RoBERTa", "ALBERT" or "GPT2").

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained`` for the
        matching checkpoint.

    Raises:
        KeyError: If ``tokenizer_name`` is not in the checkpoint table.
    """
    # Imported lazily so transformers is only loaded when a tokenizer is requested.
    from transformers import AutoTokenizer

    # Display name -> checkpoint identifier handed to from_pretrained.
    checkpoints = dict(
        BERT="bert-base-uncased",
        RoBERTa="roberta-base",
        ALBERT="albert-base-v2",
        GPT2="gpt2",
        # Disabled entries kept for reference:
        # Llama="meta-llama/Llama-2-7b-chat-hf",
        # Gemma="google/gemma-7b",
    )
    checkpoint = checkpoints[tokenizer_name]
    return AutoTokenizer.from_pretrained(checkpoint)
def generate_markdown(text,color='black',font='Arial',size=20):
    """Wrap ``text`` in a centered, styled HTML ``<p>`` element.

    The result is intended to be rendered as raw HTML, so ``text`` is
    HTML-escaped first. Without escaping, content such as the ``<s>`` and
    ``</s>`` tokens would be parsed as markup instead of being displayed.

    Args:
        text: Content of the paragraph; converted with ``str()`` and escaped.
        color: CSS color value, inserted verbatim into the style attribute.
        font: CSS font-family value, inserted verbatim.
        size: Font size in pixels, inserted verbatim.

    Returns:
        The HTML string for the styled paragraph.
    """
    # Local import keeps this block self-contained.
    import html

    # quote=False: the text lands in element content, not in an attribute,
    # so only &, < and > need escaping; quotes are left as typed.
    safe_text = html.escape(str(text), quote=False)
    return f"<p style='text-align:center; color:{color}; font-family:{font}; font-size:{size}px;'>{safe_text}</p>"
def TokenizeText(sentence,tokenizer_name):
    """Tokenize ``sentence`` and render the IDs, the tokens and their count.

    Uses the module-level ``tokenizer``; ``tokenizer_name`` is accepted but
    not read inside this function.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Display name of the selected tokenizer (unused here).

    Returns:
        The number of token IDs produced, or ``None`` when ``sentence`` is empty.
    """
    if len(sentence) == 0:
        return None

    token_ids = tokenizer(sentence)['input_ids']
    id_strings = list(map(str, token_ids))
    # Decode one ID at a time so each token is shown as its own piece.
    pieces = [tokenizer.decode([token_id]) for token_id in token_ids]
    token_count = len(pieces)

    ids_line = generate_markdown(' '.join(id_strings),size=16)
    st.markdown(ids_line, unsafe_allow_html=True)
    pieces_line = generate_markdown(' '.join(pieces),size=16)
    st.markdown(pieces_line, unsafe_allow_html=True)
    count_line = generate_markdown(f'{token_count} tokens')
    st.markdown(count_line, unsafe_allow_html=True)

    return token_count
def DeTokenizeText(input_str):
    """Decode a whitespace-separated list of token IDs and render the text.

    Uses the module-level ``tokenizer`` to decode.

    Args:
        input_str: Token IDs typed by the user, e.g. ``"101 7592 102"``.
            Any run of whitespace separates two IDs.

    Returns:
        The number of token IDs decoded, or ``None`` when the input holds no
        IDs (empty or whitespace only) or contains a non-integer entry. In
        the latter case an error message is shown instead of raising.
    """
    # split() with no argument tolerates repeated spaces, tabs and
    # leading/trailing whitespace, none of which should abort the app.
    raw_ids = input_str.split()
    if not raw_ids:
        return None

    try:
        token_ids = [int(raw_id) for raw_id in raw_ids]
    except ValueError:
        st.error('Token IDs must be integers separated by spaces.')
        return None

    decoded_sent = tokenizer.decode(token_ids)
    st.markdown(generate_markdown(decoded_sent), unsafe_allow_html=True)
    return len(token_ids)
if __name__=='__main__':
    # --- Page layout: cap the main container width and set its padding (rem). ---
    max_width, padding_top, padding_right, padding_bottom, padding_left = 1500, 0, 2, 0, 2
    container_css = f"""
<style>
.appview-container .main .block-container{{
max-width: {max_width}px;
padding-top: {padding_top}rem;
padding-right: {padding_right}rem;
padding-left: {padding_left}rem;
padding-bottom: {padding_bottom}rem;
}}
</style>
"""
    # CSS that hides the row-index cells of rendered tables.
    table_index_css = """
<style>
tbody th {display:none}
.blank {display:none}
</style>
"""
    for css in (container_css, table_index_css):
        st.markdown(css, unsafe_allow_html=True)

    # --- Title and subtitle ---
    title_html = generate_markdown('WordPiece Explorer',size=32)
    st.markdown(title_html, unsafe_allow_html=True)
    subtitle_html = generate_markdown('- quick and easy way to explore how tokenizers work -',size=24)
    st.markdown(subtitle_html, unsafe_allow_html=True)

    # --- Sidebar step 1: pick a tokenizer and load it ---
    st.sidebar.write('1. Choose the tokenizer from below')
    tokenizer_choices = ("BERT","RoBERTa","ALBERT","GPT2")
    tokenizer_name = st.sidebar.selectbox('', tokenizer_choices)
    # Must stay a module-level name: TokenizeText / DeTokenizeText read it.
    tokenizer = load_model(tokenizer_name)

    # --- Sidebar step 2: optional modes ---
    st.sidebar.write('2. Optional settings')
    comparison_mode = st.sidebar.checkbox('Compare two texts')
    detokenize = st.sidebar.checkbox('de-tokenize')
    st.sidebar.write(
        '"Compare two texts" compares # tokens for two pieces of text '
        'and "de-tokenize" converts a list of tokenized indices back to strings.'
    )
    st.sidebar.write('For "de-tokenize", make sure to type in integers, separated by single spaces.')

    default_text = 'Tokenizers decompose bigger words into smaller tokens'

    if comparison_mode:
        # Two side-by-side inputs; collect what was typed and the token counts.
        entered_texts = []
        token_counts = []
        for position, column in enumerate(st.columns(2), start=1):
            with column:
                if detokenize:
                    sentence = st.text_input(f'Tokenized IDs {position}')
                    token_counts.append(DeTokenizeText(sentence))
                else:
                    sentence = st.text_input(f'Text {position}')
                    token_counts.append(TokenizeText(sentence,tokenizer_name))
            entered_texts.append(sentence)

        # Only report a verdict once both inputs are non-empty.
        first_text, second_text = entered_texts
        if len(first_text) > 0 and len(second_text) > 0:
            st.markdown(generate_markdown('# Tokens: ',size=16), unsafe_allow_html=True)
            first_count, second_count = token_counts
            if first_count == second_count:
                verdict_html = generate_markdown('Matched! ',color='MediumAquamarine')
            else:
                verdict_html = generate_markdown('Not Matched... ',color='Salmon')
            st.markdown(verdict_html, unsafe_allow_html=True)
    elif detokenize:
        # Pre-fill the box with the IDs of the default sentence.
        default_tokens = tokenizer(default_text)['input_ids']
        prefilled_ids = ' '.join(str(token) for token in default_tokens)
        sentence = st.text_input('Tokenized IDs',value=prefilled_ids)
        num_tokens = DeTokenizeText(sentence)
    else:
        sentence = st.text_input('Text',value=default_text)
        num_tokens = TokenizeText(sentence,tokenizer_name)
|