Spaces:
Running
Running
taka-yamakoshi
committed on
Commit
•
5e5793b
1
Parent(s):
8d5d895
first commit
Browse files- app.py +72 -0
- packages.txt +1 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
from transformers import AlbertTokenizer
|
6 |
+
import io
|
7 |
+
import time
|
8 |
+
|
9 |
+
@st.cache(show_spinner=True,allow_output_mutation=True)
def load_model(model_name):
    """Load and cache the tokenizer for *model_name*.

    Args:
        model_name: Hugging Face model identifier. Only ALBERT-family
            names (prefix ``'albert'``) are supported by this demo.

    Returns:
        An ``AlbertTokenizer`` instance for *model_name*.

    Raises:
        ValueError: if *model_name* is not an ALBERT model — the original
            silently returned ``None`` here, which surfaced later as an
            opaque ``TypeError`` when the tokenizer was called.
    """
    if model_name.startswith('albert'):
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        return tokenizer
    # Fail loudly instead of returning None for unsupported models.
    raise ValueError(f'unsupported model: {model_name!r} (expected an albert-* model)')
|
14 |
+
|
15 |
+
|
16 |
+
if __name__=='__main__':

    # Page layout configuration, injected as CSS below.
    max_width = 1500
    padding_top = 0
    padding_right = 2
    padding_bottom = 0
    padding_left = 2

    # CSS: widen the main container and trim its padding.
    define_margins = f"""
    <style>
        .appview-container .main .block-container{{
            max-width: {max_width}px;
            padding-top: {padding_top}rem;
            padding-right: {padding_right}rem;
            padding-left: {padding_left}rem;
            padding-bottom: {padding_bottom}rem;
        }}
    </style>
    """
    # CSS: hide the index column of table output.
    hide_table_row_index = """
    <style>
        tbody th {display:none}
        .blank {display:none}
    </style>
    """
    st.markdown(define_margins, unsafe_allow_html=True)
    st.markdown(hide_table_row_index, unsafe_allow_html=True)

    # BUG FIX: the original passed on_change=clear_df, but clear_df is not
    # defined anywhere in this file, so the app crashed with NameError at
    # startup. The callback is dropped; restore it once clear_df exists.
    input_type = st.sidebar.radio(
        label='1. Choose the input type',
        options=('Use one of the example sentences','Use your own initial sentence')
    )

    # Title
    st.header("Tokenizer Demo")

    tokenizer = load_model('albert-xxlarge-v2')
    sent_cols = st.columns(2)
    num_tokens = {}
    for sent_id, sent_col in enumerate(sent_cols):
        with sent_col:
            sentence = st.text_input(f'Sentence {sent_id+1}')
            input_sent = tokenizer(sentence)['input_ids']
            # Drop the [CLS]/[SEP] special tokens, then decode one token at
            # a time so each sub-word is shown individually.
            decoded_sent = [tokenizer.decode([token]) for token in input_sent[1:-1]]
            num_tokens[f'sent_{sent_id}'] = len(decoded_sent)

            # Column widths proportional to token length (+2 for padding).
            char_nums = [len(word)+2 for word in decoded_sent]
            # Guard against the empty sentence: st.columns([]) raises.
            if char_nums:
                word_cols = st.columns(char_nums)
                for word_col, word in zip(word_cols, decoded_sent):
                    with word_col:
                        st.write(word)
            # BUG FIX: the original nested f'sent_{sent_id}' inside a
            # same-quoted f-string, a SyntaxError before Python 3.12;
            # use double quotes on the outer string.
            st.write(f"{num_tokens[f'sent_{sent_id}']} tokens")

    # BUG FIX: the loop above stores keys 'sent_0' and 'sent_1' (enumerate
    # starts at 0); the original compared 'sent_1' with 'sent_2', which
    # raised KeyError on every run.
    if num_tokens['sent_0'] == num_tokens['sent_1']:
        st.write('Matched!')
    else:
        st.write('Not Matched...')
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
libgl1-mesa-dev
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
sentence_transformers
|
4 |
+
opencv-python
|
5 |
+
seaborn
|
6 |
+
scikit-learn
|
7 |
+
protobuf~=3.19.0
|