Spaces:
Build error
Build error
Adding App
Browse files
app.py
CHANGED
@@ -12,20 +12,14 @@ from sklearn.manifold import TSNE
|
|
12 |
|
13 |
@st.cache
|
14 |
def load_model():
|
15 |
-
model = SentenceTransformer('hackathon-pln-es/
|
16 |
model.eval()
|
17 |
return model
|
18 |
-
|
19 |
-
@st.cache
|
20 |
-
def load_plot_data():
|
21 |
-
embs = np.load('semeval2015-embs.npy')
|
22 |
-
data = pd.read_csv('semeval2015-data.csv')
|
23 |
-
return embs, data
|
24 |
|
25 |
st.title("Sentence Embedding for Spanish with Bertin")
|
26 |
-
st.write("Sentence embedding for spanish trained
|
27 |
st.write("Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.")
|
28 |
-
st.write("Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli y Mauricio Mazuecos.")
|
29 |
|
30 |
sent1 = st.text_area('Enter sentence 1')
|
31 |
sent2 = st.text_area('Enter sentence 2')
|
@@ -36,41 +30,6 @@ if st.button('Compute similarity'):
|
|
36 |
encodings = model.encode([sent1, sent2])
|
37 |
sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0]
|
38 |
st.text('Cosine Similarity: {0:.4f}'.format(sim))
|
39 |
-
|
40 |
-
print('Generating visualization...')
|
41 |
-
sentembs, data = load_plot_data()
|
42 |
-
X_embedded = TSNE(n_components=2, learning_rate='auto',
|
43 |
-
init='random').fit_transform(np.concatenate([sentembs, encodings], axis=0))
|
44 |
-
|
45 |
-
data = data.append({'sent': sent1, 'color': '#F0E442'}, ignore_index=True) # sentence 1
|
46 |
-
data = data.append({'sent': sent2, 'color': '#D55E00'}, ignore_index=True) # sentence 2
|
47 |
-
data['x'] = X_embedded[:,0]
|
48 |
-
data['y'] = X_embedded[:,1]
|
49 |
-
|
50 |
-
source = ColumnDataSource(data)
|
51 |
-
|
52 |
-
p = figure(title="Embeddings in space")
|
53 |
-
p.circle(
|
54 |
-
x='x',
|
55 |
-
y='y',
|
56 |
-
legend_label="Objects",
|
57 |
-
#fill_color=["red"],
|
58 |
-
color='color',
|
59 |
-
fill_alpha=0.5,
|
60 |
-
line_color="blue",
|
61 |
-
size=14,
|
62 |
-
source=source
|
63 |
-
)
|
64 |
-
p.add_tools(HoverTool(
|
65 |
-
tooltips=[
|
66 |
-
('sent', '@sent')
|
67 |
-
],
|
68 |
-
formatters={
|
69 |
-
'@sent': 'printf'
|
70 |
-
},
|
71 |
-
mode='mouse'
|
72 |
-
))
|
73 |
-
st.bokeh_chart(p, use_container_width=True)
|
74 |
else:
|
75 |
st.write('Missing a sentences')
|
76 |
else:
|
|
|
12 |
|
13 |
@st.cache
|
14 |
def load_model():
|
15 |
+
model = SentenceTransformer('hackathon-pln-es/paraphrase-spanish-distilroberta')
|
16 |
model.eval()
|
17 |
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
st.title("Sentence Embedding for Spanish with Bertin")
|
20 |
+
st.write("Sentence embedding for spanish trained according to instructions in the paper [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/pdf/2004.09813.pdf) and the [documentation](https://www.sbert.net/examples/training/multilingual/README.html) accompanying its companion python package. We have used the strongest available pretrained English Bi-Encoder ([paraphrase-mpnet-base-v2](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models)) as a teacher model, and the pretrained Spanish [BERTIN](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) as the student model.Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/paraphrase-spanish-distilroberta.")
|
21 |
st.write("Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.")
|
22 |
+
st.write("Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli Pinto y Mauricio Mazuecos.")
|
23 |
|
24 |
sent1 = st.text_area('Enter sentence 1')
|
25 |
sent2 = st.text_area('Enter sentence 2')
|
|
|
30 |
encodings = model.encode([sent1, sent2])
|
31 |
sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0]
|
32 |
st.text('Cosine Similarity: {0:.4f}'.format(sim))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
else:
|
34 |
st.write('Missing a sentences')
|
35 |
else:
|