Spaces:
Running
Running
taka-yamakoshi
committed on
Commit
•
8a204f8
1
Parent(s):
e9daec4
add more functions
Browse files
app.py
CHANGED
@@ -2,16 +2,46 @@ import pandas as pd
|
|
2 |
import streamlit as st
|
3 |
import numpy as np
|
4 |
import torch
|
5 |
-
from transformers import AlbertTokenizer
|
6 |
import io
|
7 |
import time
|
8 |
|
9 |
@st.cache(show_spinner=True,allow_output_mutation=True)
|
10 |
def load_model(model_name):
|
11 |
-
if model_name.startswith('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
tokenizer = AlbertTokenizer.from_pretrained(model_name)
|
13 |
return tokenizer
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
if __name__=='__main__':
|
17 |
|
@@ -43,34 +73,34 @@ if __name__=='__main__':
|
|
43 |
st.markdown(hide_table_row_index, unsafe_allow_html=True)
|
44 |
|
45 |
# Title
|
46 |
-
st.markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
encoded_sent = [str(token) for token in input_sent[1:-1]]
|
59 |
-
decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
|
60 |
-
num_tokens[f'sent_{sent_id+1}'] = len(decoded_sent)
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
st.
|
68 |
-
st.write(' '.join(decoded_sent))
|
69 |
-
st.markdown(f"<p style='text-align: center; color: black; font-family:Arial; font-size:20px;'>{len(decoded_sent)} tokens </p>", unsafe_allow_html=True)
|
70 |
|
71 |
-
|
72 |
-
st.
|
73 |
-
|
74 |
-
st.markdown("<p style='text-align:center; color:MediumAquamarine; font-family:Arial; font-size:20px;'>Matched! </p>", unsafe_allow_html=True)
|
75 |
-
else:
|
76 |
-
st.markdown("<p style='text-align:center; color:Salmon; font-family:Arial; font-size:20px;'>Not Matched... </p>", unsafe_allow_html=True)
|
|
|
2 |
import streamlit as st
|
3 |
import numpy as np
|
4 |
import torch
|
|
|
5 |
import io
|
6 |
import time
|
7 |
|
8 |
@st.cache(show_spinner=True,allow_output_mutation=True)
def load_model(model_name):
    """Load and cache the tokenizer matching ``model_name``.

    Dispatches on the model-name prefix (``bert`` / ``gpt2`` / ``roberta`` /
    ``albert``) and imports the matching tokenizer class lazily so only the
    needed class is loaded.

    Args:
        model_name: a Hugging Face model identifier, e.g. ``'albert-base-v2'``.

    Returns:
        The tokenizer instance for ``model_name``.

    Raises:
        ValueError: if ``model_name`` matches none of the known prefixes.
            (The original elif-chain fell through and crashed with
            UnboundLocalError on the final ``return tokenizer``.)
    """
    if model_name.startswith('bert'):
        from transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained(model_name)
    elif model_name.startswith('gpt2'):
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    elif model_name.startswith('roberta'):
        from transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
    elif model_name.startswith('albert'):
        from transformers import AlbertTokenizer
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
    else:
        raise ValueError(f'Unsupported model name: {model_name}')
    return tokenizer
|
23 |
|
24 |
+
def generate_markdown(text, color='black', font='Arial', size=20):
    """Return *text* wrapped in a centered HTML ``<p>`` tag.

    Args:
        text: the content to display.
        color: CSS color name for the text.
        font: CSS font-family.
        size: font size in pixels.

    Returns:
        An HTML string suitable for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    css = f"text-align:center; color:{color}; font-family:{font}; font-size:{size}px;"
    return f"<p style='{css}'>{text}</p>"
|
26 |
+
|
27 |
+
def TokenizeText(sentence):
    """Tokenize *sentence*, render the result in the Streamlit app, and
    return the token count.

    Relies on the module-level ``tokenizer`` (set in ``__main__``) and on
    ``st`` / ``generate_markdown``. Displays three rows: the raw token ids,
    the decoded token strings, and a centered token-count caption.

    Args:
        sentence: user-entered text; may be empty.

    Returns:
        The number of tokens (excluding special tokens), or 0 for an empty
        sentence. (The original implicitly returned None in the empty case,
        giving the function an inconsistent return type.)
    """
    if not sentence:
        return 0

    # Slice [1:-1] strips the special tokens added at both ends.
    # NOTE(review): assumes exactly one special token per end (BERT/ALBERT
    # style); gpt2 tokenizers add none — confirm for all selectable models.
    input_sent = tokenizer(sentence)['input_ids']
    encoded_sent = [str(token) for token in input_sent[1:-1]]
    decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
    num_tokens = len(decoded_sent)

    st.write(' '.join(encoded_sent))
    st.write(' '.join(decoded_sent))
    st.markdown(generate_markdown(f'{num_tokens} tokens'), unsafe_allow_html=True)

    return num_tokens
|
44 |
+
|
45 |
|
46 |
if __name__=='__main__':
|
47 |
|
|
|
73 |
st.markdown(hide_table_row_index, unsafe_allow_html=True)
|
74 |
|
75 |
# Title
|
76 |
+
st.markdown(generate_markdown('Tokenizer Demo',size=32), unsafe_allow_html=True)
|
77 |
+
|
78 |
+
# Select and load the tokenizer
|
79 |
+
tokenizer_name = st.selectbox('Choose the tokenizer from below',
|
80 |
+
('bert-base-uncased','bert-large-cased',
|
81 |
+
'gpt2','gpt2-large',
|
82 |
+
'roberta-base','roberta-large',
|
83 |
+
'albert-base-v2','albert-xxlarge-v2'),index=7)
|
84 |
+
tokenizer = load_model(tokenizer_name)
|
85 |
|
86 |
+
comparison_mode = st.checkbox('Compare two texts')
|
87 |
+
if comparison_mode:
|
88 |
+
sent_cols = st.columns(2)
|
89 |
+
num_tokens = {}
|
90 |
+
sents = {}
|
91 |
+
for sent_id, sent_col in enumerate(sent_cols):
|
92 |
+
with sent_col:
|
93 |
+
sentence = st.text_input(f'Text {sent_id+1}')
|
94 |
+
sents[f'sent_{sent_id+1}'] = sentence
|
95 |
+
num_tokens[f'{sent_id+1}'] = TokenizeText(sentence)
|
|
|
|
|
|
|
96 |
|
97 |
+
if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
|
98 |
+
st.markdown(generate_markdown('Result: ',size=16), unsafe_allow_html=True)
|
99 |
+
if num_tokens[f'sent_1']==num_tokens[f'sent_2']:
|
100 |
+
st.markdown(generate_markdown('Matched! ',color='MediumAquamarine'), unsafe_allow_html=True)
|
101 |
+
else:
|
102 |
+
st.markdown(generate_markdown('Not Matched... ',color='Salmon'), unsafe_allow_html=True)
|
|
|
|
|
103 |
|
104 |
+
else:
|
105 |
+
sentence = st.text_input(f'Text')
|
106 |
+
num_tokens = TokenizeText(sentence)
|
|
|
|
|
|