Commit 9240bf4 by taka-yamakoshi
Parent(s): 21c2f11

minor update

app.py CHANGED
@@ -106,19 +106,20 @@ if __name__=='__main__':
     tokenizer = load_model(tokenizer_name)
 
     comparison_mode = st.sidebar.checkbox('Compare two texts')
-    detokenize = st.sidebar.checkbox('de-tokenize')
+    detokenize = st.sidebar.checkbox('de-tokenize (make sure to type in integers separated by single spaces)')
     if comparison_mode:
         sent_cols = st.columns(2)
         num_tokens = {}
         sents = {}
         for sent_id, sent_col in enumerate(sent_cols):
             with sent_col:
-                sentence = st.text_input(f'Text {sent_id+1}')
-                sents[f'sent_{sent_id+1}'] = sentence
                 if detokenize:
+                    sentence = st.text_input(f'Tokenized IDs {sent_id+1}')
                     num_tokens[f'sent_{sent_id+1}'] = DeTokenizeText(sentence)
                 else:
+                    sentence = st.text_input(f'Text {sent_id+1}')
                     num_tokens[f'sent_{sent_id+1}'] = TokenizeText(sentence,tokenizer_name)
+                sents[f'sent_{sent_id+1}'] = sentence
 
         if len(sents['sent_1'])>0 and len(sents['sent_2'])>0:
             st.markdown(generate_markdown('Result: ',size=16), unsafe_allow_html=True)
@@ -128,8 +129,9 @@ if __name__=='__main__':
             st.markdown(generate_markdown('Not Matched... ',color='Salmon'), unsafe_allow_html=True)
 
     else:
-        sentence = st.text_input(f'Text')
         if detokenize:
+            sentence = st.text_input(f'Tokenized IDs')
             num_tokens = DeTokenizeText(sentence)
         else:
+            sentence = st.text_input(f'Text')
             num_tokens = TokenizeText(sentence,tokenizer_name)