Lautaro committed on
Commit
718142c
1 Parent(s): 6a92e9f

Adding App

Browse files
Files changed (1) hide show
  1. app.py +3 -44
app.py CHANGED
@@ -12,20 +12,14 @@ from sklearn.manifold import TSNE
12
 
13
  @st.cache
14
  def load_model():
15
- model = SentenceTransformer('hackathon-pln-es/bertin-roberta-base-finetuning-esnli')
16
  model.eval()
17
  return model
18
-
19
- @st.cache
20
- def load_plot_data():
21
- embs = np.load('semeval2015-embs.npy')
22
- data = pd.read_csv('semeval2015-data.csv')
23
- return embs, data
24
 
25
  st.title("Sentence Embedding for Spanish with Bertin")
26
- st.write("Sentence embedding for spanish trained on NLI. Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/bertin-roberta-base-finetuning-esnli.")
27
  st.write("Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.")
28
- st.write("Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli y Mauricio Mazuecos.")
29
 
30
  sent1 = st.text_area('Enter sentence 1')
31
  sent2 = st.text_area('Enter sentence 2')
@@ -36,41 +30,6 @@ if st.button('Compute similarity'):
36
  encodings = model.encode([sent1, sent2])
37
  sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0]
38
  st.text('Cosine Similarity: {0:.4f}'.format(sim))
39
-
40
- print('Generating visualization...')
41
- sentembs, data = load_plot_data()
42
- X_embedded = TSNE(n_components=2, learning_rate='auto',
43
- init='random').fit_transform(np.concatenate([sentembs, encodings], axis=0))
44
-
45
- data = data.append({'sent': sent1, 'color': '#F0E442'}, ignore_index=True) # sentence 1
46
- data = data.append({'sent': sent2, 'color': '#D55E00'}, ignore_index=True) # sentence 2
47
- data['x'] = X_embedded[:,0]
48
- data['y'] = X_embedded[:,1]
49
-
50
- source = ColumnDataSource(data)
51
-
52
- p = figure(title="Embeddings in space")
53
- p.circle(
54
- x='x',
55
- y='y',
56
- legend_label="Objects",
57
- #fill_color=["red"],
58
- color='color',
59
- fill_alpha=0.5,
60
- line_color="blue",
61
- size=14,
62
- source=source
63
- )
64
- p.add_tools(HoverTool(
65
- tooltips=[
66
- ('sent', '@sent')
67
- ],
68
- formatters={
69
- '@sent': 'printf'
70
- },
71
- mode='mouse'
72
- ))
73
- st.bokeh_chart(p, use_container_width=True)
74
  else:
75
  st.write('Missing a sentences')
76
  else:
 
12
 
13
  @st.cache
14
  def load_model():
15
+ model = SentenceTransformer('hackathon-pln-es/paraphrase-spanish-distilroberta')
16
  model.eval()
17
  return model
 
 
 
 
 
 
18
 
19
  st.title("Sentence Embedding for Spanish with Bertin")
20
+ st.write("Sentence embedding for spanish trained according to instructions in the paper [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/pdf/2004.09813.pdf) and the [documentation](https://www.sbert.net/examples/training/multilingual/README.html) accompanying its companion python package. We have used the strongest available pretrained English Bi-Encoder ([paraphrase-mpnet-base-v2](https://www.sbert.net/docs/pretrained_models.html#sentence-embedding-models)) as a teacher model, and the pretrained Spanish [BERTIN](https://huggingface.co/bertin-project/bertin-roberta-base-spanish) as the student model.Used for Sentence Textual Similarity. Based on the model hackathon-pln-es/paraphrase-spanish-distilroberta.")
21
  st.write("Introduce two sentence to see their cosine similarity and a graph showing them in the embedding space.")
22
+ st.write("Authors: Anibal Pérez, Emilio Tomás Ariza, Lautaro Gesuelli Pinto y Mauricio Mazuecos.")
23
 
24
  sent1 = st.text_area('Enter sentence 1')
25
  sent2 = st.text_area('Enter sentence 2')
 
30
  encodings = model.encode([sent1, sent2])
31
  sim = cos_sim(encodings[0], encodings[1]).numpy().tolist()[0][0]
32
  st.text('Cosine Similarity: {0:.4f}'.format(sim))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  else:
34
  st.write('Missing a sentences')
35
  else: