Spaces:
Runtime error
Runtime error
adding parquets
Browse files
- app.py +10 -17
- assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
- assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0
- assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet +3 -0
- assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet +3 -0
app.py
CHANGED
@@ -109,7 +109,7 @@ def quant_panel(embedding_df):
|
|
109 |
st.markdown("* Each **point** is an input example.")
|
110 |
st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
|
111 |
st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
|
112 |
-
st.altair_chart(data_comparison(down_samp(embedding_df)))
|
113 |
|
114 |
|
115 |
def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
|
@@ -156,15 +156,11 @@ def get_data(spotlight, emb):
|
|
156 |
|
157 |
@st.cache(ttl=600)
|
158 |
def clustering(data,num_clusters):
|
159 |
-
|
160 |
X = np.array(data['embedding'].tolist())
|
161 |
-
|
162 |
kclusterer = KMeansClusterer(
|
163 |
num_clusters, distance=nltk.cluster.util.cosine_distance,
|
164 |
repeats=25,avoid_empty_clusters=True)
|
165 |
-
|
166 |
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
|
167 |
-
|
168 |
data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
|
169 |
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
|
170 |
|
@@ -222,22 +218,18 @@ if __name__ == "__main__":
|
|
222 |
# ******* loading the model and the data
|
223 |
dataset = st.sidebar.selectbox(
|
224 |
"Dataset",
|
225 |
-
["amazon_polarity", "
|
226 |
-
index=
|
227 |
)
|
228 |
|
229 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
230 |
-
"distilbert-base-uncased-finetuned-sst-2-english")
|
231 |
-
|
232 |
model = st.sidebar.selectbox(
|
233 |
"Model",
|
234 |
["distilbert-base-uncased-finetuned-sst-2-english",
|
235 |
-
"
|
236 |
-
index=0
|
237 |
)
|
238 |
|
239 |
loss_quantile = st.sidebar.slider(
|
240 |
-
"Loss Quantile", min_value=0.
|
241 |
)
|
242 |
|
243 |
run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
|
@@ -245,10 +237,11 @@ if __name__ == "__main__":
|
|
245 |
num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
|
246 |
|
247 |
### LOAD DATA AND SESSION VARIABLES ###
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
|
|
252 |
if "user_data" not in st.session_state:
|
253 |
st.session_state["user_data"] = data_df
|
254 |
if "selected_slice" not in st.session_state:
|
|
|
109 |
st.markdown("* Each **point** is an input example.")
|
110 |
st.markdown("* Gray points have low-loss and the colored have high-loss. High-loss instances are clustered using **kmeans** and each color represents a cluster.")
|
111 |
st.markdown("* The **shape** of each point reflects the label category -- positive (diamond) or negative sentiment (circle).")
|
112 |
+
st.altair_chart(data_comparison(down_samp(embedding_df)), use_container_width=True)
|
113 |
|
114 |
|
115 |
def frequent_tokens(data, tokenizer, loss_quantile=0.95, top_k=200, smoothing=0.005):
|
|
|
156 |
|
157 |
@st.cache(ttl=600)
|
158 |
def clustering(data,num_clusters):
|
|
|
159 |
X = np.array(data['embedding'].tolist())
|
|
|
160 |
kclusterer = KMeansClusterer(
|
161 |
num_clusters, distance=nltk.cluster.util.cosine_distance,
|
162 |
repeats=25,avoid_empty_clusters=True)
|
|
|
163 |
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
|
|
|
164 |
data['cluster'] = pd.Series(assigned_clusters, index=data.index).astype('int')
|
165 |
data['centroid'] = data['cluster'].apply(lambda x: kclusterer.means()[x])
|
166 |
|
|
|
218 |
# ******* loading the model and the data
|
219 |
dataset = st.sidebar.selectbox(
|
220 |
"Dataset",
|
221 |
+
["amazon_polarity", "yelp_polarity"],
|
222 |
+
index = 1
|
223 |
)
|
224 |
|
|
|
|
|
|
|
225 |
model = st.sidebar.selectbox(
|
226 |
"Model",
|
227 |
["distilbert-base-uncased-finetuned-sst-2-english",
|
228 |
+
"albert-base-v2-yelp-polarity"],
|
|
|
229 |
)
|
230 |
|
231 |
loss_quantile = st.sidebar.slider(
|
232 |
+
"Loss Quantile", min_value=0.5, max_value=1.0,step=0.01,value=0.95
|
233 |
)
|
234 |
|
235 |
run_kmeans = st.sidebar.radio("Cluster error slice?", ('True', 'False'), index=0)
|
|
|
237 |
num_clusters = st.sidebar.slider("# clusters", min_value=1, max_value=20, step=1, value=3)
|
238 |
|
239 |
### LOAD DATA AND SESSION VARIABLES ###
|
240 |
+
data_df = pd.read_parquet('./assets/data/'+dataset+ '_'+ model+'.parquet')
|
241 |
+
if model == 'albert-base-v2-yelp-polarity':
|
242 |
+
tokenizer = AutoTokenizer.from_pretrained('textattack/'+model)
|
243 |
+
else:
|
244 |
+
tokenizer = AutoTokenizer.from_pretrained(model)
|
245 |
if "user_data" not in st.session_state:
|
246 |
st.session_state["user_data"] = data_df
|
247 |
if "selected_slice" not in st.session_state:
|
assets/data/amazon_polarity_albert-base-v2-yelp-polarity.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bce0297bedc66865c01644421ea934008d74807befb7b0bd94aa92729bd02a59
|
3 |
+
size 56644779
|
assets/data/amazon_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a193c26851f48b7b76a35986ced0dc1fddafd26b92f1aaf9a4e69fd83fd2f2e4
|
3 |
+
size 56643545
|
assets/data/yelp_polarity_albert-base-v2-yelp-polarity.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a56147880841c6f78a868fb58f6e97661547009e570c2887ef7c12ffd54474e
|
3 |
+
size 103294569
|
assets/data/yelp_polarity_distilbert-base-uncased-finetuned-sst-2-english.parquet
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:165515be2837df9b02f782fe1e7bd3b31bb01c49960e73238f77541eee7589ad
|
3 |
+
size 61796202
|