Gabriela Nicole Gonzalez Saez committed on
Commit
8d6b878
1 Parent(s): bb65bbe
Files changed (3)
  1. app.py +758 -0
  2. plotsjs.js +744 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,758 @@
1
+ import gradio as gr
2
+ from time import time
3
+
4
+ import torch
5
+ import os
6
+ # import nltk
7
+ import argparse
8
+ import random
9
+ import numpy as np
10
+ import faiss
11
+ from argparse import Namespace
12
+ from tqdm.notebook import tqdm
13
+ from torch.utils.data import DataLoader
14
+ from functools import partial
15
+ from sklearn.manifold import TSNE
16
+
17
+ from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel
18
+ import os
19
+ dir_path = os.path.dirname(os.path.realpath(__file__))
20
+ print(dir_path)
21
+
22
+ metadata_all = {}
23
+ model_es = "Helsinki-NLP/opus-mt-en-es"
24
+ model_fr = "Helsinki-NLP/opus-mt-en-fr"
25
+ model_zh = "Helsinki-NLP/opus-mt-en-zh"
26
+
27
+ tokenizer_es = AutoTokenizer.from_pretrained(model_es)
28
+ tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
29
+ tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
30
+
31
+ model_tr_es = MarianMTModel.from_pretrained(model_es)
32
+ model_tr_fr = MarianMTModel.from_pretrained(model_fr)
33
+ model_tr_zh = MarianMTModel.from_pretrained(model_zh)
34
+
35
+ dict_models = {
36
+ 'en-es': model_es,
37
+ 'en-fr': model_fr,
38
+ 'en-zh': model_zh,
39
+ }
40
+
41
+ dict_models_tr = {
42
+ 'en-es': model_tr_es,
43
+ 'en-fr': model_tr_fr,
44
+ 'en-zh': model_tr_zh,
45
+ }
46
+
47
+ dict_tokenizer_tr = {
48
+ 'en-es': tokenizer_es,
49
+ 'en-fr': tokenizer_fr,
50
+ 'en-zh': tokenizer_zh,
51
+ }
52
+
53
+ from faiss import write_index, read_index
54
+ import pickle
55
+
56
+
57
+
58
+ def translation_model(w1, model):
59
+ inputs = dict_tokenizer_tr[model](w1, return_tensors="pt")
60
+ # embeddings = get_tokens_embeddings(inputs, model)
61
+ input_embeddings = dict_models_tr[model].get_encoder().embed_tokens(inputs.input_ids)
62
+ # model_tr_es.get_input_embeddings()
63
+ print(inputs)
64
+ num_ret_seq = 1
65
+ translated = dict_models_tr[model].generate(**inputs,
66
+ num_beams=5,
67
+ num_return_sequences=num_ret_seq,
68
+ return_dict_in_generate=True,
69
+ output_attentions =False,
70
+ output_hidden_states = True,
71
+ output_scores=True,)
72
+
73
+ tgt_text = dict_tokenizer_tr[model].decode(translated.sequences[0], skip_special_tokens=True)
74
+
75
+ target_embeddings = dict_models_tr[model].get_decoder().embed_tokens(translated.sequences)
76
+
77
+ return tgt_text, translated, inputs.input_ids, input_embeddings, target_embeddings
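+ # Example (hypothetical input): translation_model("Hello world", "en-es") returns
+ # (translated text, the generate() output object, source token ids, encoder input embeddings, decoder token embeddings of the generated sequence).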
78
+
79
+ def create_vocab_multiple(embeddings_list, model):
80
+ """_summary_
81
+
82
+ Args:
83
+ embeddings_list (list): dicts with 'tokens' (token ids per sentence) and 'embeddings' (one vector per token)
84
+
85
+ Returns:
86
+ (dict, list): vocabulary keyed by token id, plus the ordered token ids of each sentence
87
+ """
88
+ print("START VOCAB CREATION MULTIPLE \n \n ")
89
+ vocab = {} ## add embedds.
90
+ sentence_tokens_text_list = []
91
+ for embeddings in embeddings_list:
92
+ tokens_id = embeddings['tokens'] # [[tokens_id]x n_sentences ]
93
+ for sent_i, sentence in enumerate(tokens_id):
94
+ sentence_tokens = []
95
+ for tok_i, token in enumerate(sentence):
96
+ sentence_tokens.append(token)
97
+ if not (token in vocab):
98
+ vocab[token] = {
99
+ 'token' : token,
100
+ 'count': 1,
101
+ # 'text': embeddings['texts'][sent_i][tok_i],
102
+ 'text': dict_tokenizer_tr[model].decode([token]),
103
+ # 'text': src_token_lists[sent_i][tok_i],
104
+ 'embed': embeddings['embeddings'][sent_i][tok_i]}
105
+ else:
106
+ vocab[token]['count'] = vocab[token]['count'] + 1
107
+ # print(vocab)
108
+ sentence_tokens_text_list.append(sentence_tokens)
109
+ print("END VOCAB CREATION MULTIPLE \n \n ")
110
+ return vocab, sentence_tokens_text_list
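+ # vocab maps token_id -> {'token', 'count', 'text', 'embed'}; sentence_tokens_text_list keeps the ordered token ids of each sentence.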
111
+
112
+ def vocab_words_all_prefix(token_embeddings, model, suffix="@@", prefix='▁'):
113
+ vocab = {}
114
+ # inf_model = dict_models_tr[model]
115
+ sentence_words_text_list = []
116
+ if prefix :
117
+ n_prefix = len(prefix)
118
+ for input_sentences in token_embeddings:
119
+ # n_tokens_in_word
120
+ for sent_i, sentence in enumerate(input_sentences['tokens']):
121
+ words_text_list = []
122
+ # embedding = input_sentences['embed'][sent_i]
123
+ word = ''
124
+ tokens_ids = []
125
+ embeddings = []
126
+ ids_to_tokens = dict_tokenizer_tr[model].convert_ids_to_tokens(sentence)
127
+ # print("validate same len", len(sentence) == len(ids_to_tokens), len(sentence), len(ids_to_tokens), ids_to_tokens)
128
+
129
+ to_save= False
130
+ for tok_i, token_text in enumerate(ids_to_tokens):
131
+ token_id = sentence[tok_i]
132
+ if token_text[:n_prefix] == prefix :
133
+ #first we save the previous word
134
+ if to_save:
135
+ vocab[word] = {
136
+ 'word' : word,
137
+ 'text': word,
138
+ 'count': 1,
139
+ 'tokens_ids' : tokens_ids,
140
+ 'embed': np.mean(np.array(embeddings), 0).tolist()
141
+ }
142
+ words_text_list.append(word)
143
+ #word is starting if prefix
144
+ tokens_ids = [token_id]
145
+ embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
146
+ word = token_text[n_prefix:]
147
+ ## if word
148
+ to_save = True
149
+
150
+ else :
151
+ if (token_text in dict_tokenizer_tr[model].special_tokens_map.values()):
152
+ # print('final or save', token_text, token_id, to_save, word)
153
+ if to_save:
154
+ # vocab[word] = ids
155
+ vocab[word] = {
156
+ 'word' : word,
157
+ 'text': word,
158
+ 'count': 1,
159
+ 'tokens_ids' : tokens_ids,
160
+ 'embed': np.mean(np.array(embeddings), 0).tolist()
161
+ }
162
+ words_text_list.append(word)
163
+ #special token is one token element, no continuation
164
+ # vocab[token_text] = [token_id]
165
+ tokens_ids = [token_id]
166
+ embeddings = [input_sentences['embeddings'][sent_i][tok_i]]
167
+ vocab[token_text] = {
168
+ 'word' : token_text,
169
+ 'count': 1,
170
+ 'text': token_text,
171
+ 'tokens_ids' : tokens_ids,
172
+ 'embed': np.mean(np.array(embeddings), 0).tolist()
173
+ }
174
+ words_text_list.append(token_text)
175
+ to_save = False
176
+ else:
177
+ # is a continuation; we do not know if it is final; we don't save here.
178
+ to_save = True
179
+ word += token_text
180
+ tokens_ids.append(token_id)
181
+ embeddings.append(input_sentences['embeddings'][sent_i][tok_i])
182
+ if to_save:
183
+ # print('final save', token_text, token_id, to_save, word)
185
+ if not (word in vocab):
186
+ vocab[word] = {
187
+ 'word' : word,
188
+ 'count': 1,
189
+ 'text': word,
190
+ 'tokens_ids' : tokens_ids,
191
+ 'embed': np.mean(np.array(embeddings), 0).tolist()
192
+ }
193
+ words_text_list.append(word)
194
+ else:
195
+ vocab[word]['count'] = vocab[word]['count'] + 1
196
+ sentence_words_text_list.append(words_text_list)
197
+
198
+ return vocab, sentence_words_text_list
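+ # SentencePiece marks word-initial pieces with '▁', so consecutive pieces are merged into words and their
+ # token embeddings are averaged (np.mean) to give one vector per word; special tokens become single-token entries.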
199
+
200
+ # nb_ids.append(token_values['token']) # for x in vocab_tokens]
201
+ # nb_embds.append(token_values['embed']) # for x in vocab_tokens]
202
+
203
+ def create_index_voronoi(vocab):
204
+ """
205
+ Build an IVF (Voronoi) FAISS index over the token vocabulary; returns the index and a metadata dict mapping FAISS row positions to tokens.
206
+ """
207
+ d = 1024
208
+ nb_embds = [] ##ordered embeddings list
209
+ metadata = {}
210
+ i_pos = 0
211
+ for key_token, token_values in vocab.items():
212
+ nb_embds.append(token_values['embed']) # for x in vocab_tokens]
213
+ metadata[i_pos] = {'token': token_values['token'], 'text': token_values['text']}
214
+ i_pos += 1
215
+ # nb_embds = [x['embed'] for x in vocab_tokens]
216
+
217
+ # print(len(nb_embds),len(nb_embds[0]) )
218
+ xb = np.array(nb_embds).astype('float32') #elements to index
219
+ # ids = np.array(nb_ids)
220
+ d = len(xb[0]) # dimension of each element
221
+
222
+ nlist = 5 # Nb of Voronois
223
+ quantizer = faiss.IndexFlatL2(d)
224
+ index = faiss.IndexIVFFlat(quantizer, d, nlist)
225
+ index.train(xb)
226
+ index.add(xb)
227
+ # index.add(xb)
228
+
229
+ return index, metadata## , nb_embds, nb_ids
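+ # IndexIVFFlat partitions the vectors into nlist=5 Voronoi cells and must be trained before add();
+ # searches visit index.nprobe cells (1 by default), which trades recall for speed.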
230
+
231
+ def create_index_voronoi_words(vocab):
232
+ """
233
+ Build an IVF (Voronoi) FAISS index over the word vocabulary; returns the index and a metadata dict mapping FAISS row positions to words.
234
+ """
235
+ d = 1024
236
+ nb_embds = [] ##ordered embeddings list
237
+ metadata = {}
238
+ i_pos = 0
239
+ for key_token, token_values in vocab.items():
240
+ nb_embds.append(token_values['embed']) # for x in vocab_tokens]
241
+ metadata[i_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'],'text': token_values['text']}
242
+ i_pos += 1
243
+ # nb_embds = [x['embed'] for x in vocab_tokens]
244
+
245
+ # print(len(nb_embds),len(nb_embds[0]) )
246
+ xb = np.array(nb_embds).astype('float32') #elements to index
247
+ # ids = np.array(nb_ids)
248
+ d = len(xb[0]) # dimension of each element
249
+
250
+ nlist = 5 # Nb of Voronois
251
+ quantizer = faiss.IndexFlatL2(d)
252
+ index = faiss.IndexIVFFlat(quantizer, d, nlist)
253
+ index.train(xb)
254
+ index.add(xb)
255
+ # index.add(xb)
256
+
257
+ return index, metadata## , nb_embds, nb_ids
258
+
259
+ def search_query_vocab(index, vocab_queries, topk = 10, limited_search = []):
260
+ """ the embed queries are a vocabulary of words : embds_input_voc
261
+
262
+ Args:
263
+ index (faiss.Index): FAISS index to search.
264
+ vocab_queries (dict): query vocabulary; each entry has the form
265
+ { 'token' : token,
266
+ 'count': 1,
267
+ 'text': src_token_lists[sent_i][tok_i],
268
+ 'embed': embeddings[0]['embeddings'][sent_i][tok_i] }
269
+ limited_search (list, optional): currently unused.
270
+ topk (int, optional): nb of similar tokens. Defaults to 10.
271
+
272
+ Returns:
273
+ (D, I, metadata): distance matrix D, index matrix I, and metadata mapping each query row to its word and token ids.
274
+ """
275
+ # nb_qi_ids = [] ##ordered ids list
276
+ nb_q_embds = [] ##ordered embeddings list
277
+ metadata = {}
278
+ qi_pos = 0
279
+ for key , token_values in vocab_queries.items():
280
+ # nb_qi_ids.append(token_values['token']) # for x in vocab_tokens]
281
+ metadata[qi_pos] = {'word': token_values['word'], 'tokens': token_values['tokens_ids'], 'text': token_values['text']}
282
+ qi_pos += 1
283
+ nb_q_embds.append(token_values['embed']) # for x in vocab_tokens]
284
+
285
+ xq = np.array(nb_q_embds).astype('float32') #elements to query
286
+
287
+ D,I = index.search(xq, topk)
288
+
289
+ return D,I, metadata
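+ # D and I have shape (n_queries, topk); FAISS fills missing neighbours with id -1, which callers filter out.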
290
+
291
+ def search_query_vocab_token(index, vocab_queries, topk = 10, limited_search = []):
292
+ """ the embed queries are a vocabulary of words : embds_input_vov
293
+ Returns:
294
+ (D, I, metadata): distance matrix D, index matrix I, and metadata mapping each query row to its token id and text.
295
+ """
296
+ # nb_qi_ids = [] ##ordered ids list
297
+ nb_q_embds = [] ##ordered embeddings list
298
+ metadata = {}
299
+ qi_pos = 0
300
+ for key , token_values in vocab_queries.items():
301
+ # nb_qi_ids.append(token_values['token']) # for x in vocab_tokens]
302
+ metadata[qi_pos] = {'token': token_values['token'], 'text': token_values['text']}
303
+ qi_pos += 1
304
+ nb_q_embds.append(token_values['embed']) # for x in vocab_tokens]
305
+
306
+ xq = np.array(nb_q_embds).astype('float32') #elements to query
307
+
308
+ D,I = index.search(xq, topk)
309
+
310
+ return D,I, metadata
311
+
312
+ def build_search(query_embeddings, model,type="input"):
313
+ global metadata_all
314
+
315
+ ## build vocab for index
316
+ vocab_queries, sentence_tokens_list = create_vocab_multiple(query_embeddings, model)
317
+ words_vocab_queries, sentence_words_list = vocab_words_all_prefix(query_embeddings, model, suffix="@@", prefix="▁")
318
+
319
+ index_vor_tokens = metadata_all[type]['tokens'][1]
320
+ md_tokens = metadata_all[type]['tokens'][2]
321
+ D, I, meta = search_query_vocab_token(index_vor_tokens, vocab_queries)
322
+
323
+ qi_pos = 0
324
+ similar_tokens = {}
325
+ # similar_tokens = []
326
+ for dist, ind in zip(D,I):
327
+ try:
328
+ # similar_tokens.append({
329
+ similar_tokens[str(meta[qi_pos]['token'])] = {
330
+ 'token': meta[qi_pos]['token'],
331
+ 'text': meta[qi_pos]['text'],
332
+ # 'text': dict_tokenizer_tr[model].decode(meta[qi_pos]['token'])
333
+ # 'text': meta[qi_pos]['text'],
334
+ "similar_topk": [md_tokens[i_index]['token'] for i_index in ind if (i_index != -1) ],
335
+ "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
336
+ }
337
+ # )
338
+ except:
339
+ print("\n ERROR ", qi_pos, dist, ind)
340
+ qi_pos += 1
341
+
342
+
343
+ index_vor_words = metadata_all[type]['words'][1]
344
+ md_words = metadata_all[type]['words'][2]
345
+
346
+ Dw, Iw, metaw = search_query_vocab(index_vor_words, words_vocab_queries)
347
+ # D, I, meta, vocab_words, sentence_words_list = result_input['words']# [2] # D ; I ; meta
348
+ qi_pos = 0
349
+ # similar_words = []
350
+ similar_words = {}
351
+ for dist, ind in zip(Dw,Iw):
352
+ try:
353
+ # similar_words.append({
354
+ similar_words[str(metaw[qi_pos]['word']) ] = {
355
+ 'word': metaw[qi_pos]['word'],
356
+ 'text': metaw[qi_pos]['word'],
357
+ "similar_topk": [md_words[i_index]['word'] for i_index in ind if (i_index != -1) ],
358
+ "distance": [dist[i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
359
+ }
360
+ # )
361
+ except:
362
+ print("\n ERROR ", qi_pos, dist, ind)
363
+ qi_pos += 1
364
+
365
+
366
+ return {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar':similar_tokens, 'sentence_key_list': sentence_tokens_list},
367
+ 'words': {'D':Dw,'I': Iw, 'meta': metaw, 'vocab_queries':words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}
368
+ }
369
+
370
+ def build_reference(all_embeddings, model):
371
+
372
+ ## build vocab for index
373
+ vocab, sentence_tokens = create_vocab_multiple(all_embeddings,model)
374
+ words_vocab, sentences = vocab_words_all_prefix(all_embeddings, model, suffix="@@", prefix="▁")
375
+
376
+ index_tokens, meta_tokens = create_index_voronoi(vocab)
377
+ index_words, meta_words = create_index_voronoi_words(words_vocab)
378
+
379
+
380
+
381
+ return {'tokens': [vocab, index_tokens, meta_tokens],
382
+ 'words': [words_vocab, index_words, meta_words]
383
+ } # , index, meta
384
+
385
+
386
+ def embds_input_projection_vocab(vocab, key="token"):
387
+ t0 = time()
388
+
389
+ nb_ids = [] ##ordered ids list
390
+ nb_embds = [] ##ordered embeddings list
391
+ nb_text = [] ##ordered embeddings list
392
+ tnse_error = []
393
+ for _ , token_values in vocab.items():
394
+ tnse_error.append([0,0])
395
+ nb_ids.append(token_values[key]) # for x in vocab_tokens]
396
+ nb_text.append(token_values['text']) # for x in vocab_tokens]
397
+ nb_embds.append(token_values['embed']) # for x in vocab_tokens]
398
+
399
+ X = np.array(nb_embds).astype('float32') #elements to project
400
+ try:
401
+ tsne = TSNE(random_state=0, n_iter=1000)
402
+ tsne_results = tsne.fit_transform(X)
403
+
404
+ tsne_results = np.c_[tsne_results, nb_ids, nb_text, range(len(nb_ids))] ## zip array: [[tsne_x, tsne_y, token_id, token_text, row_index], ...]
405
+ except:
406
+ tsne_results = np.c_[tnse_error, nb_ids, nb_text, range(len(nb_ids))] ## fallback when t-SNE fails: coordinates default to (0, 0)
407
+
408
+ t1 = time()
409
+ print("t-SNE: %.2g sec" % (t1 - t0))
410
+ print(tsne_results)
411
+
412
+ return tsne_results.tolist()
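+ # Each returned row is [tsne_x, tsne_y, id, text, row_index]; downstream code relies on this column order.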
413
+
414
+ def filtered_projection(similar_key, vocab, type="input", key="word"):
415
+ global metadata_all
416
+ vocab_proj = vocab.copy()
417
+ ## tnse projection Input words
418
+ source_words_voc_similar = set()
419
+ # for words_set in similar_key:
420
+ for key_i in similar_key:
421
+ words_set = similar_key[key_i]
422
+ source_words_voc_similar.update(words_set['similar_topk'])
423
+
424
+ print(len(source_words_voc_similar))
425
+ # source_embeddings_filtered = {key: metadata_all['input']['words'][0][key] for key in source_words_voc_similar}
426
+ source_embeddings_filtered = {key_value: metadata_all[type][key][0][key_value] for key_value in source_words_voc_similar}
427
+ vocab_proj.update(source_embeddings_filtered)
428
+ ## vocab_proj add
429
+ try:
430
+ result_TSNE = embds_input_projection_vocab(vocab_proj, key=key[:-1]) ## singular => without 's'
431
+ dict_projected_embds_all = {str(embds[2]): [embds[0], embds[1], embds[2], embds[3], embds[4]] for embds in result_TSNE}
432
+ except:
433
+ print('TSNE error', type, key)
434
+ dict_projected_embds_all = {}
435
+
436
+
437
+
438
+ # print(result_TSNE)
439
+ return dict_projected_embds_all
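+ # Result: {token_id_or_word (str): [x, y, id, text, row_index]} covering the query vocabulary plus every
+ # reference entry that appeared in a similar_topk list, so queries and their neighbours share one projection.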
440
+
441
+ def first_function(w1, model):
442
+ global metadata_all
443
+ #translate and get internal values
444
+ # print(w1)
445
+ sentences = w1.split("\n")
446
+ all_sentences = []
447
+ translated_text = ''
448
+ input_embeddings = []
449
+ output_embeddings = []
450
+ for sentence in sentences :
451
+ # print(sentence, end=";")
452
+ params = translation_model(sentence, model)
453
+ all_sentences.append(params)
454
+ # print(len(params))
455
+ translated_text += params[0] + ' \n'
456
+ input_embeddings.append({
457
+ 'embeddings': params[3].detach(), ## create a vocabulary with the set of embeddings
458
+ 'tokens': params[2].tolist(), # one translation = one sentence
459
+ # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist())
460
+
461
+ })
462
+ output_embeddings.append({
463
+ 'embeddings' : params[4].detach(),
464
+ 'tokens': params[1].sequences.tolist(),
465
+ # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
466
+ })
467
+ # print(input_embeddings)
468
+ # print(output_embeddings)
469
+
470
+ ## Build FAISS index
471
+ # ---> preload faiss using the respective model with an initial dataset.
472
+ result_input = build_reference(input_embeddings,model)
473
+ result_output = build_reference(output_embeddings,model)
474
+ # print(result_input, result_output)
475
+
476
+ metadata_all = {'input': result_input, 'output': result_output}
477
+
478
+ ### get translation
479
+
480
+ return [translated_text, params]
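+ # first_function builds the reference FAISS indexes (kept in the global metadata_all) that the queries
+ # issued later by first_function_tr are compared against.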
481
+
482
+ def first_function_tr(w1, model, var2={}):
483
+ global metadata_all
484
+ # Translate and find similar tokens/words in the reference FAISS index
485
+ print("SEARCH -- ")
486
+ sentences = w1.split("\n")
487
+ all_sentences = []
488
+ translated_text = ''
489
+ input_embeddings = []
490
+ output_embeddings = []
491
+ for sentence in sentences :
492
+ # print(sentence, end=";")
493
+ params = translation_model(sentence, model)
494
+ all_sentences.append(params)
495
+ # print(len(params))
496
+ translated_text += params[0] + ' \n'
497
+ input_embeddings.append({
498
+ 'embeddings': params[3].detach(), ## create a vocabulary with the set of embeddings
499
+ 'tokens': params[2].tolist(), # one translation = one sentence
500
+ # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist()[0])
501
+ })
502
+ output_embeddings.append({
503
+ 'embeddings' : params[4].detach(),
504
+ 'tokens': params[1].sequences.tolist(),
505
+ # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
506
+ })
507
+
508
+ ## Build FAISS index
509
+ # ---> preload faiss using the respective model with an initial dataset.
510
+ result_search = {}
511
+ result_search['input'] = build_search(input_embeddings, model, type='input')
512
+ result_search['output'] = build_search(output_embeddings, model, type='output')
513
+
514
+ # D, I, meta, vocab_words, sentence_words_list = result_input['words']# [2] # D ; I ; meta
515
+ # md = metadata_all['input']['words'][2]
516
+ # qi_pos = 0
517
+ # similar_words = []
518
+ # for dist, ind in zip(D,I):
519
+ # try:
520
+ # similar_words.append({
521
+ # 'word': meta[qi_pos]['word'],
522
+ # "similar_topk": [md[i_index]['word'] for i_index in ind if (i_index != -1) ],
523
+ # "distance": [D[qi_pos][i] for (i, i_index) in enumerate(ind) if (i_index != -1)],
524
+ # })
525
+ # except:
526
+ # print("\n ERROR ", qi_pos, dist, ind)
527
+ # qi_pos += 1
528
+ # similar_vocab_queries = similar_vocab_queries[3]
529
+
530
+ # result_output = build_search(output_embeddings, model, type="output")
531
+ ## {'tokens': {'D': D, 'I': I, 'meta': meta, 'vocab_queries': vocab_queries, 'similar':similar_tokens},
532
+ ## 'words': {'D':Dw,'I': Iw, 'meta': metaw, 'vocab_queries':words_vocab_queries, 'sentence_key_list': sentence_words_list, 'similar': similar_words}
533
+ ## }
534
+
535
+ # print(result_input, result_output)
536
+
537
+
538
+ # json_out['input']['tokens'] = { 'similar_queries' : result_input['token'][5], # similarity and distance dict.
539
+ # 'tnse': dict_projected_embds_all, #projected points (all)
540
+ # 'key_text_list': result_input['token'][4], # current sentences keys
541
+ # }
542
+
543
+ json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
544
+ dict_projected = {}
545
+ for type in ['input', 'output']:
546
+ dict_projected[type] = {}
547
+ for key in ['tokens', 'words']:
548
+ similar_key = result_search[type][key]['similar']
549
+ vocab = result_search[type][key]['vocab_queries']
550
+ dict_projected[type][key] = filtered_projection(similar_key, vocab, type=type, key=key)
551
+ json_out[type][key]['similar_queries'] = similar_key
552
+ json_out[type][key]['tnse'] = dict_projected[type][key]
553
+ json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
554
+
555
+ return [translated_text, [ json_out, json_out['output']['words'], json_out['output']['tokens']] ]
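+ # json_out['input'|'output']['tokens'|'words'] holds {'similar_queries', 'tnse', 'key_text_list'} and is
+ # consumed client-side by testFn_out_json_tr in plotsjs.js.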
556
+
557
+
558
+ from pathlib import Path
559
+ ## First create html and divs
560
+ html = """
561
+ <html>
562
+ <script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
563
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
564
+ <script async data-require="d3@3.5.3" data-semver="3.5.3"
565
+ src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
566
+ <body>
567
+ <div id="select_div">
568
+ <select id="select_type" class="form-select" aria-label="select example" hidden>
569
+ <option selected value="words">Words</option>
570
+ <option value="tokens">Tokens</option>
571
+ </select>
572
+ </div>
573
+ <div id="d3_embed_div">
574
+ <div class="row">
575
+ <div class="col-6">
576
+ <div id="d3_embeds_input_words" class="d3_embed words"></div>
577
+ </div>
578
+ <div class="col-6">
579
+ <div id="d3_embeds_output_words" class="d3_embed words"></div>
580
+
581
+ </div>
582
+ <div class="col-6">
583
+ <div id="d3_embeds_input_tokens" class="d3_embed tokens"></div>
584
+ </div>
585
+ <div class="col-6">
586
+ <div id="d3_embeds_output_tokens" class="d3_embed tokens"></div>
587
+ </div>
588
+ </div>
589
+ </div>
590
+ <div id="d3_graph_div">
591
+ <div class="row">
592
+ <div class="col-4">
593
+ <div id="d3_graph_input_words" class="d3_graph words"></div>
594
+
595
+ </div>
596
+ <div class="col-4">
597
+ <div id="similar_input_words" class=""></div>
598
+ </div>
599
+ <div class="col-4">
600
+ <div id="d3_graph_output_words" class="d3_graph words"></div>
601
+ <div id="similar_output_words" class="d3_graph words"></div>
602
+ </div>
603
+ </div>
604
+ <div class="row">
605
+ <div class="col-6">
606
+ <div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
607
+ <div id="similar_input_tokens" class="d3_graph tokens"></div>
608
+ </div>
609
+ <div class="col-6">
610
+ <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
611
+ <div id="similar_output_tokens" class="d3_graph tokens"></div>
612
+ </div>
613
+ </div>
614
+ </div>
615
+ </body>
616
+
617
+ </html>
618
+ """
619
+ html0 = """
620
+ <html>
621
+ <script async src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js"></script>
622
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script>
623
+ <script async data-require="d3@3.5.3" data-semver="3.5.3"
624
+ src="//cdnjs.cloudflare.com/ajax/libs/d3/3.5.3/d3.js"></script>
625
+ <body>
626
+ <div id="select_div">
627
+ <select id="select_type" class="form-select" aria-label="select example" hidden>
628
+ <option selected value="words">Words</option>
629
+ <option value="tokens">Tokens</option>
630
+ </select>
631
+ </div>
632
+ </body>
633
+
634
+ </html>
635
+ """
636
+
637
+ html_col1 = """
638
+ <div id="d3_graph_input_words" class="d3_graph words"></div>
639
+ <div id="d3_graph_input_tokens" class="d3_graph tokens"></div>
640
+ """
641
+
642
+ html_col2 = """
643
+ <div id="similar_input_words" class=""></div>
644
+ <div id="similar_output_words" class=""></div>
645
+ <div id="similar_input_tokens" class=" "></div>
646
+ <div id="similar_output_tokens" class=" "></div>
647
+
648
+ """
649
+
650
+
651
+ html_col3 = """
652
+ <div id="d3_graph_output_words" class="d3_graph words"></div>
653
+ <div id="d3_graph_output_tokens" class="d3_graph tokens"></div>
654
+ """
655
+
656
+
657
+ # # <div class="row">
658
+ # <div class="col-6" id="d3_legend_data_source"> </div>
659
+ # <div class="col-6" id="d3_legend_similar_source"> </div>
660
+ # </div>
661
+ def second_function(w1,j2):
662
+ # json_value = {'one':1}# return f"{w1['two']} in sentence22..."
663
+ # to transfer the data to json.
664
+ print("second_function -- after the js", w1,j2)
665
+ return "transition to second js function finished."
666
+
667
+ paths = []
668
+ def save_index(model) :
669
+ names = []
670
+ with open(model + '_metadata_ref.pkl', 'wb') as f:
671
+ pickle.dump(metadata_all, f)
672
+ names.append(model + '_metadata_ref.pkl')
673
+ for type in ['tokens','words']:
674
+ for kind in ['input', 'output']:
675
+ ## save index file
676
+ name = model + "_" + kind + "_"+ type + ".index"
677
+ write_index(metadata_all[kind][type][1], name)
678
+ names.append(name)
679
+ print("in save index done")
680
+ return gr.File(names)
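+ # Writes <model>_metadata_ref.pkl plus one .index file per (input|output) x (tokens|words) pair and
+ # returns them through the gr.File output so they can be downloaded.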
681
+
682
+
683
+ with gr.Blocks(js="plotsjs.js") as demo:
684
+ gr.Markdown(
685
+ """
686
+ # MAKE NMT Workshop \t `Embeddings representation`
687
+ """)
688
+ with gr.Row():
689
+ with gr.Column(scale=1):
690
+ model_radio_c = gr.Radio(choices=['en-es', 'en-zh', 'en-fr'], value="en-es", label= '', container=False)
691
+
692
+ with gr.Column(scale=2):
693
+ gr.Markdown(
694
+ """
695
+ ### Reference Translation Sentences
696
+ Enter at least 50 sentences to be used as comparison.
697
+ This is submitted just once.
698
+ """)
699
+ in_text = gr.Textbox(lines=2, label="reference source text")
700
+ out_text = gr.Textbox(label="reference target text", interactive=False)
701
+ out_text2 = gr.Textbox(visible=False)
702
+ var2 = gr.JSON(visible=False)
703
+ btn = gr.Button("Reference Translation")
704
+ # save_index_btn = gr.Button("Download reference index")
705
+ # file_obj = gr.File(label="Input File")
706
+ # input = file_obj
707
+ save_index_btn = gr.Button("Generate index files to download ",)
708
+ tab2_outputs = gr.File()
709
+ input = tab2_outputs
710
+
711
+ # save_output = gr.Button("Download", link="/file=en-es_input_tokens.index")
712
+
713
+
714
+ with gr.Column(scale=3):
715
+
716
+ gr.Markdown(
717
+ """
718
+ ### Translation Sentences
719
+ Sentences to be analysed.
720
+ """)
721
+ in_text_tr = gr.Textbox(lines=2, label="source text")
722
+ out_text_tr = gr.Textbox(label="target text", interactive=False)
723
+ out_text2_tr = gr.Textbox(visible=False)
724
+ var2_tr = gr.JSON(visible=False)
725
+ btn_faiss= gr.Button("Translation ")
726
+ gr.Button("Download", link="/file=en-es_input_tokens.index")
727
+
728
+ with gr.Row():
729
+ # input_mic = gr.HTML(html)
730
+ with gr.Column(scale=1):
731
+ input_mic = gr.HTML(html0)
732
+ input_html2 = gr.HTML(html_col2)
733
+
734
+ with gr.Column(scale=2):
735
+ input_html1 = gr.HTML(html_col1)
736
+ # with gr.Column(scale=2):
737
+
738
+ with gr.Column(scale=2):
739
+ input_html3 = gr.HTML(html_col3)
740
+
741
+ ## first function input w1, model ; return out_text, var2; it does first function and js;
742
+ btn.click(first_function, [in_text, model_radio_c], [out_text,var2], js="(in_text,model_radio_c) => testFn_out(in_text,model_radio_c)") #should return an output comp.
743
+ btn_faiss.click(first_function_tr, [in_text_tr, model_radio_c], [out_text_tr,var2_tr], js="(in_text_tr,model_radio_c) => testFn_out(in_text_tr,model_radio_c)") #should return an output comp.
744
+ ## second function input out_text(returned in first_function), [json]var2(returned in first_function) ;
745
+ ## second function returns out_text2, var2; it does second function and js(with the input params);
746
+ out_text.change(second_function, [out_text, var2], out_text2, js="(out_text,var2) => testFn_out_json(var2)") #
747
+ out_text_tr.change(second_function, [out_text_tr, var2_tr], out_text2_tr, js="(out_text_tr,var2_tr) => testFn_out_json_tr(var2_tr)") #
748
+ save_index_btn.click(save_index, [model_radio_c], [tab2_outputs])
749
+
750
+ # tab2_submit_button.click(func2,
751
+ # inputs=tab2_inputs,
752
+ # outputs=tab2_outputs)
753
+
754
+ # run script function on load,
755
+ # demo.load(None,None,None,js="plotsjs.js")
756
+ # allowed_paths
757
+ if __name__ == "__main__":
758
+ demo.launch(allowed_paths=["./", ".", "/"])
plotsjs.js ADDED
@@ -0,0 +1,744 @@
1
+
2
+ async () => {
3
+ // set testFn() on globalThis, so your html onclick can access it
4
+
5
+
6
+ globalThis.testFn = () => {
7
+ document.getElementById('demo').innerHTML = "Hello?"
8
+ };
9
+
10
+ const d3 = await import("https://cdn.jsdelivr.net/npm/d3@7/+esm");
11
+ // const d3 = await import("https://cdn.jsdelivr.net/npm/d3@5/+esm");
12
+ const $ = await import("https://cdn.jsdelivr.net/npm/[email protected]/dist/jquery.min.js");
13
+
14
+ globalThis.$ = $;
15
+ globalThis.d3 = d3;
16
+
17
+ globalThis.d3Fn = () => {
18
+ d3.select('#viz').append('svg')
19
+ .append('rect')
20
+ .attr('width', 50)
21
+ .attr('height', 50)
22
+ .attr('fill', 'black')
23
+ .on('mouseover', function(){d3.select(this).attr('fill', 'red')})
24
+ .on('mouseout', function(){d3.select(this).attr('fill', 'black')});
25
+
26
+ };
27
+
28
+ globalThis.testFn_out = (val,model_radio_c) => {
29
+ // document.getElementById('demo').innerHTML = val
30
+ console.log(val, "testFn_out");
31
+ // globalThis.d3Fn();
32
+ return([val,model_radio_c]);
33
+ };
34
+
35
+
36
+ globalThis.testFn_out_json = (data) => {
37
+ console.log(data, "testFn_out_json --");
38
+ // var $ = jQuery;
39
+ // console.log( d3.select('#d3_embeddings'));
40
+ return(['string', {}])
41
+ }
42
+
43
+ globalThis.testFn_out_json_tr = (data) => {
44
+ // data['input|output']['words|tokens']
45
+
46
+ console.log(data, "testFn_out_json_tr new");
47
+ var $ = jQuery;
48
+ console.log("$('#d3_embeddings')");
49
+ console.log($('#d3_embeddings'));
50
+ // d3.select('#d3_embeddings').html("");
51
+
52
+
53
+ d3.select("#d3_embeds_source").html("here");
54
+
55
+ // words or token visualization ?
56
+ console.log(d3.select("#select_type").node().value);
57
+ d3.select("#select_type").attr("hidden", null);
58
+ d3.select("#select_type").on("change", change);
59
+ change();
60
+ // tokens
61
+ // network plots;
62
+ ['input', 'output'].forEach(text_type => {
63
+ ['tokens', 'words'].forEach(text_key => {
64
+ // console.log(type, key, data[0][text_type]);
65
+ data_i = data[0][text_type][text_key];
66
+ embeddings_network([], data_i['tnse'], data_i['similar_queries'], type=text_type +"_"+text_key, )
67
+ });
68
+ });
69
+
70
+
71
+
72
+
73
+
74
+ // data_proj = data['tsne']; // it is not a dict.
75
+ // d3.select("#d3_embeds_" + type).html(scatterPlot(data_proj, data_sentences, dict_token_sentence_id, similar_vocab_queries, 'd3_embeds_'+type, type ));
76
+ // d3.select('#d3_embeddings').append(function(){return Tree(root);});
77
+ // embeddings_network(data['source_tokens'], data['dict_projected_embds_all']['source'], data['similar_vocab_queries']['source'], "source")
78
+
79
+ // source
80
+ // embeddings_graph(data['dict_projected_embds_all'],source_tks_list, data['source_tokens'], data['similar_vocab_queries'], "source"); //, data['similar_text'], data['similar_embds']);
81
+ // target decision: all tokens ? or separated by language ? hint: do not assume they share the same dict.
82
+ // embeddings_graph(data['dict_projected_embds_all'], translated_tks_text, translated_tks_ids_by_sent, data['similar_vocab_queries'], "target"); //, data['similar_text'], data['similar_embds']);
83
+
84
+ return(['string', {}])
85
+
86
+ }
87
+
88
+ function change() {
89
+ show_type = d3.select("#select_type").node().value;
90
+ // hide all
91
+ d3.selectAll(".d3_embed").attr("hidden",'');
92
+ d3.selectAll(".d3_graph").attr("hidden", '');
93
+ // show current type;
94
+ d3.select("#d3_embeds_input_" + show_type).attr("hidden", null);
95
+ d3.select("#d3_embeds_output_" + show_type).attr("hidden", null);
96
+ d3.select("#d3_graph_input_" + show_type).attr("hidden", null);
97
+ d3.select("#d3_graph_output_" + show_type).attr("hidden", null);
98
+ }
99
+
100
+
101
+
102
+ function embeddings_network(tokens_text, dict_projected_embds, similar_vocab_queries, type="source", ){
103
+ // tokens_text : not used;
104
+ // dict_projected_embds = tnse
105
+ console.log("Each token is a node; distance if in similar list", type );
106
+ console.log(tokens_text, dict_projected_embds, similar_vocab_queries);
107
+ // similar_vocab_queries_target[key]['similar_topk']
108
+
109
+ var nodes_tokens = {}
110
+ var nodeHash = {};
111
+ var nodes = []; // [{id: , label: }]
112
+ var edges = []; // [{source: , target: weight: }]
113
+ var edges_ids = []; // [{source: , target: weight: }]
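+ // Graph construction: each query token/word becomes a 'sentence' node (type_i 0); each of its similar_topk
+ // neighbours becomes a 'similar' node (type_i 1) linked with the FAISS distance as edge weight; consecutive
+ // query items are chained with weight-1 edges to preserve sentence order.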
114
+
115
+ // similar_vocab_queries {key: {similar_topk : [], distance : []}}
116
+ console.log('similar_vocab_queries', similar_vocab_queries);
117
+ prev_node = '';
118
+ for ([sent_token, value] of Object.entries(similar_vocab_queries)) {
119
+ // console.log('dict_projected_embds',sent_token, parseInt(sent_token), value, dict_projected_embds);
120
+ // sent_token = parseInt(sent_token); // Object.entries assumes key:string;
121
+ token_text = dict_projected_embds[sent_token][3]
122
+ if (!nodeHash[sent_token]) {
123
+ nodeHash[sent_token] = {id: sent_token, label: token_text, type: 'sentence', type_i: 0};
124
+ nodes.push(nodeHash[sent_token]);
125
+ }
126
+ sim_tokens = value['similar_topk']
127
+ dist_tokens = value['distance']
128
+
129
+ for (let index = 0; index < sim_tokens.length; index++) {
130
+ const sim = sim_tokens[index];
131
+ const dist = dist_tokens[index];
132
+
133
+ token_text_sim = dict_projected_embds[sim][3]
134
+ if (!nodeHash[sim]) {
135
+ nodeHash[sim] = {id: sim, label: token_text_sim, type:'similar', type_i: 1};
136
+ nodes.push(nodeHash[sim]);
137
+ }
138
+ edges.push({source: nodeHash[sent_token], target: nodeHash[sim], weight: dist});
139
+ edges_ids.push({source: sent_token, target: sim, weight: dist});
140
+ }
141
+
142
+ if (prev_node != '' ) {
143
+ edges.push({source: nodeHash[prev_node], target:nodeHash[sent_token], weight: 1});
144
+ edges_ids.push({source: prev_node, target: sent_token, weight: 1});
145
+ }
146
+ prev_node = sent_token;
147
+
148
+ }
149
+ console.log("TYPE", type, edges, nodes, edges_ids, similar_vocab_queries)
150
+ // d3.select('#d3_graph_input_tokens').html(networkPlot({nodes: nodes, links:edges}, similar_vocab_queries, div_type=type) );
151
+ // type +"_"+key
152
+ d3.select('#d3_graph_'+type).html("");
153
+ d3.select('#d3_graph_'+type).append(function(){return networkPlot({nodes: nodes, links:edges}, similar_vocab_queries, dict_projected_embds,div_type=type);});
154
+
155
+ // $('#d3_embeds_network_target').html(networkPlot({nodes: nodes, links:edges}));
156
+ // $('#d3_embeds_network_'+type).html(etworkPlot({nodes: nodes, link:edges}));
157
+ }
158
+
159
+ function embeddings_graph(data, source_tokens_text_list, source_tokens, similar_vocab_queries, type="source") {
160
+ /*
161
+ ### source
162
+ data: dict_projected_embds_all = { token_id: [tns1, tns2, token_id, token_text] ...}
163
+ ### target
164
+ */
165
+ console.log("embeddings_graph");
166
+ active_sentences = get_sentences();
167
+ console.log("active_sentences", active_sentences, type); // working
168
+
169
+ active_sentences_tokens_text = active_sentences.map((x) => source_tokens_text_list[x]);
170
+ active_sentences_tokens = active_sentences.map((x) => source_tokens[x]);
171
+
172
+ console.log(active_sentences_tokens);
173
+
174
+ data_sentences = []
175
+ dict_token_sentence_id = {}
176
+ // active_sentences_tokens.forEach((sentence, i) => {
177
+ source_tokens_text_list.forEach((sentence, i) => {
178
+ /// opt1
179
+ proj = []
180
+ sentence.forEach((tok, tok_j) => {
181
+ console.log("tok,tok_j", tok, tok_j);
182
+ token_text = source_tokens_text_list[i][tok_j];
183
+ proj.push([data[tok][0], data[tok][1], token_text, i, tok_j, tok])
184
+ if (token_text in dict_token_sentence_id){
185
+ dict_token_sentence_id[token_text].push(i);
186
+ }
187
+ else{
188
+ dict_token_sentence_id[token_text] = [i];
189
+ }
190
+ });
191
+ data_sentences.push(proj);
192
+ });
193
+ console.log("data_sentences error here in target", data_sentences);
194
+
195
+ console.log(data);
196
+
197
+ $('#d3_embeds_' + type).html(scatterPlot(data, data_sentences, dict_token_sentence_id, similar_vocab_queries, 'd3_embeds_'+type, type ));
198
+ }
199
+
200
+
201
+ /*
202
+ data: dict_projected_embds_all = { token_id: [tns1, tns2, token_id, token_text] ...}
203
+ */
204
+ function scatterPlot(data, data_sentences, dict_token_sentence_id, similar_vocab_queries, div_name, div_type="source", {
205
+ width = 400, // outer width, in pixels
206
+ height , // outer height, in pixels
207
+ r = 3, // radius of nodes
208
+ padding = 1, // horizontal padding for first and last column
209
+ // text = d => d[2],
210
+ } = {}){
211
+ // data_dict = data[div_type];
212
+ var data_dict = { ...data[div_type] };
213
+ data = Object.values(data[div_type]);
214
+ // similar_vocab_queries = similar_vocab_queries[div_type];
215
+ var similar_vocab_queries = { ...similar_vocab_queries[div_type] };
216
+ console.log("div_type, data, data_dict, data_sentences, dict_token_sentence_id, similar_vocab_queries");
217
+ console.log(div_type, data, data_dict, data_sentences, dict_token_sentence_id, similar_vocab_queries);
218
+
219
+ // Create the SVG container.
220
+ var margin = {top: 10, right: 10, bottom: 30, left: 50 },
221
+ width = width - margin.left - margin.right,
222
+ height = 400 - margin.top - margin.bottom;
223
+
224
+ // append the svg object to the body of the page
225
+ var svg = d3.create("svg")
226
+ // .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
227
+ .attr("width", width + margin.left + margin.right)
228
+ .attr("height", height + margin.top + margin.bottom)
229
+
230
+ svg.append("g")
231
+ .attr("transform",
232
+ "translate(" + margin.left + "," + margin.top + ")");
233
+
234
+ // const svg = d3.create("svg")
235
+ // .attr("width", width)
236
+ // .attr("height", height);
237
+
238
+ // Add X axis
239
+ min_value_x = d3.min(data, d => d[0])
240
+ max_value_x = d3.max(data, d => d[0])
241
+
242
+
243
+ var x = d3.scaleLinear()
244
+ .domain([min_value_x, max_value_x])
245
+ .range([ margin.left , width ]);
246
+
247
+ svg.append("g")
248
+ // .attr("transform", "translate("+ margin.left +"," + height + ")")
249
+ .attr("transform", "translate(0," + height + ")")
250
+ .call(d3.axisBottom(x));
251
+
252
+ // Add Y axis
253
+ min_value_y = d3.min(data, d => d[1])
254
+ max_value_y = d3.max(data, d => d[1])
255
+
256
+ var y = d3.scaleLinear()
257
+ .domain([min_value_y, max_value_y])
258
+ .range([ height, margin.top]);
259
+
260
+ svg.append("g")
261
+ .attr("transform", "translate("+ margin.left +", 0)")
262
+ .call(d3.axisLeft(y));
263
+
264
+ svg.selectAll()
265
+ .data(data)
266
+ .enter()
267
+ .append('circle')
268
+ .attr("class", function (d) { return "dot-" + d[2] } )
269
+ // .attr("cx", function (d) { return x(d[0] + margin.left); } )
270
+ .attr("cx", function (d) { return x(d[0]); } )
271
+ .attr("cy", function (d) { return y(d[1] - margin.bottom); } )
272
+ .attr("r", 5)
273
+ .style("fill", "#e85252")
274
+ .style("fillOpacity",0.2)
275
+ .style("stroke", "#000000ff")
276
+ .style("strokeWidth", 1)
277
+ .style("opacity", 0.7);
278
+
279
+ // svg.selectAll()
280
+ // .data(data)
281
+ // .enter()
282
+ // .append('text')
283
+ // .text(d => d[3])
284
+ // .attr("class", function (d) { return "text-" + d[2] } )
285
+ // // .attr("cx", function (d) { return x(d[0] + margin.left); } )
286
+ // .attr("x", function (d) { return x(d[0]); } )
287
+ // .attr("y", function (d) { return y(d[1] - margin.bottom); } )
288
+ // .attr("dy", "0.35em");
289
+
290
+ // colors = ['#cb1dd1',"#e0ac2b", "#e85252", "#6689c6", "#9a6fb0", "#a53253"];
291
+ colors = ['#6689c6',"#e0ac2b", "#e0ac2b", "#cb1dd1", "#cb1dd1", "#cb1dd1"];
292
+
293
+ // create a tooltip
294
+ var Tooltip = d3.select("#"+div_name)
295
+ .append("div")
296
+ .style("opacity", 0)
297
+ .attr("class", "tooltip")
298
+ .style("background-color", "white")
299
+ .style("border", "solid")
300
+ .style("border-width", "2px")
301
+ .style("border-radius", "5px")
302
+ .style("padding", "5px")
303
+ .text("I'm a circle!");
304
+
305
+ // const colorScale = d3.scaleOrdinal()
306
+ // .domain(domain_values)
307
+ // .range(["#e0ac2b", "#e85252", "#6689c6", "#9a6fb0", "#a53253"]);
308
+ // colorScale(d.group)
309
+
310
+ for (let i_snt = 0; i_snt < data_sentences.length; i_snt++) {
311
+ const sentence = data_sentences[i_snt];
312
+ // similar_tokens;
313
+ console.log("sentence: ", sentence);
314
+
315
+ svg.selectAll()
316
+ .data(sentence)
317
+ .enter()
318
+ .append('text')
319
+ .text(d => d[2])
320
+ .attr("class", function (d) { return "text-" + d[2] + " sent-" + i_snt } )
321
+ // .attr("cx", function (d) { return x(d[0] + margin.left); } )
322
+ .attr("x", function (d) { return x(d[0]); } )
323
+ .attr("y", function (d) { return y(d[1] - margin.bottom); } )
324
+ .attr("dy", "0.35em")
325
+ .attr("sentence_i", i_snt );
326
+
327
+ svg.selectAll()
328
+ .data(sentence)
329
+ .enter()
330
+ .append('circle')
331
+ .attr("class", function (d) { return "dot " + d[2] + " " + i_snt } )
332
+ // .attr("cx", function (d) { return x(d[0] + margin.left); } )
333
+ .attr("cx", function (d) { return x(d[0]); } )
334
+ .attr("cy", function (d) { return y(d[1] - margin.bottom); } )
335
+ .attr("sentence_i", i_snt )
336
+ .attr("r", 6)
337
+ .style("fill", colors[0])
338
+ .style("fillOpacity",0.2)
339
+ .style("stroke", "#000000")
340
+ .style("strokeWidth", 1)
341
+ .style("opacity", 1)
342
+ .on('click', change_legend )
343
+ .on('mouseover', highlight_mouseover )
344
+ .on('mouseout', highlight_mouseout )
345
+ // .on("mousemove", mousemove);
346
+
347
+
348
+ }
349
+
350
+
351
+ function change_legend(d,i) {
352
+ console.log(d,i);
353
+ if (i[2] in dict_token_sentence_id){
354
+ show_sentences(dict_token_sentence_id[i[2]], i[2]);
355
+
356
+ show_similar_tokens(i[5], '#d3_legend_similar_'+type);
357
+
358
+ console.log(dict_token_sentence_id[i[2]]);
359
+ }
360
+ else{console.log("no sentence")};
361
+ }
362
+
363
+ function highlight_mouseover(d,i) {
364
+ console.log("highlight_mouseover", d,i);
365
+ token_id = parseInt(i[5])
366
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
367
+ d3.select(this).transition()
368
+ .duration('50')
369
+ .style('opacity', '1')
370
+ .attr("r", 12)
371
+
372
+ similar_ids.forEach(similar_token => {
373
+ d3.selectAll('.dot-' + similar_token).attr("r",12 ).style('opacity', '1')//.raise()
374
+ });
375
+
376
+ Tooltip
377
+ .style("opacity", 1)
378
+ .style("visibility", "visible")
379
+ // .style("top", (event.pageY-height)+"px").style("left",(event.pageX-width)+"px")
380
+ d3.select(this)
381
+ .style("stroke", "red")
382
+ .attr("strokeWidth", 2)
383
+ .style("opacity", 0.7)
384
+
385
+ // .html("The exact value of<br>this cell is: ")
386
+ // .style("left", (d3.mouse(this)[0]+70) + "px")
387
+ // .style("top", (d3.mouse(this)[1]) + "px")
388
+
389
+ }
390
+ function highlight_mouseout(d,i) {
391
+ token_id = parseInt(i[5])
392
+ console.log("similar_vocab_queries", similar_vocab_queries);
393
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
394
+ // clean_sentences();
395
+ d3.select(this).transition()
396
+ .duration('50')
397
+ .style('opacity', '.7')
398
+ .attr("r", 6)
399
+
400
+ similar_ids.forEach(similar_token => {
401
+ d3.selectAll('.dot-' + similar_token).attr("r",6 ).style('opacity', '.7')
402
+ });
403
+
404
+ Tooltip
405
+ .style("opacity", 0)
406
+ d3.select(this)
407
+ .style("stroke", "none")
408
+ .style("opacity", 0.8)
409
+ }
410
+
411
+ function mousemove(d,i) {
412
+ console.log("mousemove", d, i)
413
+ pointer = d3.pointer(d);
414
+ Tooltip
415
+ .html("The exact value of<br> ")
416
+ // .style("top", ((e.pageY ) - (height*2)) +"px")
417
+ // .attr("transform", `translate(${pointer[0]},0)`)
418
+ .style("top", height - pointer[1] +"px")
419
+ .style("left", pointer[0]+"px")
420
+ }
421
+
422
+
423
+ function show_sentences(sentences_id, token) {
424
+
425
+ // Show sentences with token "token"
426
+ d3.select('#d3_legend_data_'+div_type).html("");
427
+ console.log("show_sentences", data_sentences, sentences_id);
428
+ sentences_id.forEach(sent_id => {
429
+ console.log(data_sentences[sent_id])
430
+ // console.log(data_sentences[sent_id].map( x => x[2] ));
431
+ // p = d3.select('#d3_legend_data').append("p").enter();
432
+ d3.select('#d3_legend_data_'+div_type)
433
+ .selectAll().append("p")
434
+ .data(data_sentences[sent_id])
435
+ .enter()
436
+ .append('text')
437
+ .attr('class_data', sent_id)
438
+ .attr('class_id', d => d[5])
439
+ .style("background", d=> {if (d[2]== token) return "yellow"} )
440
+ .text( d => d[2] + " ");
441
+ d3.select('#d3_legend_data_'+div_type).append("p").enter();
442
+ });
443
+ // $("#d3_legend_data")
444
+ // data_sentences
445
+ }
446
+
447
+ function clean_sentences() {
448
+ d3.select('#d3_legend_data_'+div_type).html("");
449
+ }
450
+
451
+ function show_similar_tokens(token, div_name_similar= '#d3_legend_similar_') {
452
+ d3.select(div_name_similar).html("");
453
+ console.log("token", token);
454
+ console.log("similar_vocab_queries[token]", similar_vocab_queries[token]);
455
+ token_data = similar_vocab_queries[token];
456
+ console.log(token, token_data);
457
+ var decForm = d3.format(".3f");
458
+
459
+ d3.select(div_name_similar)
460
+ .selectAll().append("p")
461
+ .data(token_data['similar_topk'])
462
+ .enter()
463
+ .append("p").append('text')
464
+ // .attr('class_data', sent_id)
465
+ .attr('class_id', d => d)
466
+ .style("background", d=> {if (d == token) return "yellow"} )
467
+ // .text( d => d + " \n ");
468
+ .text((d,i) => do_text(d,i) );
469
+
470
+ function do_text(d,i){
471
+ console.log("do_text d,i" );
472
+ console.log(d,i);
473
+ console.log("data_dict[d], data_dict");
474
+ // console.log(data_dict[d], data_dict);
475
+ // return data_dict[d][3] + " " + decForm(token_data['distance'][i]) + " ";
476
+ return " " + decForm(token_data['distance'][i]) + " ";
477
+ }
478
+
479
+
480
+ }
481
+ // data_sentences
482
+
483
+ // .attr('x', (d) => x_scale(d[0]) + margin.left)
484
+ // .attr('y', (d) => y_scale(d[1]) + margin_top_extra)
485
+ // .attr("rx", 4)
486
+ // .attr("ry", 4)
487
+ // .attr("stroke", "#F7F7F7")
488
+ // .attr("stroke-width","2px")
489
+ // .attr('width', x_scale.bandwidth())
490
+ // .attr('height', (d) => height_text);
491
+ // // .attr('fill', (d) => color_scale(d.value));
492
+
493
+ // Add dots
494
+ // svg.append('g')
495
+ // // .selectAll("dot")
496
+ // .data(data)
497
+ // .enter()
498
+ // .append("circle")
499
+ // .attr("class", function (d) { return "dot " + d[2] } )
500
+ // .attr("cx", function (d) { return x(d[0]); } )
501
+ // .attr("cy", function (d) { return y(d[1]); } )
502
+ // .attr("r", 5)
503
+ // .style("fill", function (d) { return color(d.Species) } )
504
+ // .on("mouseover", highlight)
505
+ // .on("mouseleave", doNotHighlight )
506
+
507
+
508
+
509
+ return svg.node();
510
+ }
511
+
512
+
513
+
514
+ function networkPlot(data, similar_vocab_queries,dict_proj, div_type="source", {
515
+ width = 400, // outer width, in pixels
516
+ height , // outer height, in pixels
517
+ r = 3, // radius of nodes
518
+ padding = 1, // horizontal padding for first and last column
519
+ // text = d => d[2],
520
+ } = {}){
521
+ // data_dict = data;
522
+ data = data// [div_type];
523
+ similar_vocab_queries = similar_vocab_queries// [div_type];
524
+ console.log("data, similar_vocab_queries, div_type");
525
+ console.log(data, similar_vocab_queries, div_type);
526
+
527
+ // Create the SVG container.
528
+ var margin = {top: 10, right: 10, bottom: 30, left: 50 },
529
+ width = width //- margin.left - margin.right,
530
+ height = 400 //- margin.top - margin.bottom;
531
+
532
+ width_box = width + margin.left + margin.right;
533
+ height_box = height + margin.top + margin.bottom
534
+ totalWidth = width*2;
535
+ // append the svg object to the body of the page
536
+ // const parent = d3.create("div");
537
+ // const body = parent.append("div")
538
+ // .style("overflow-x", "scroll")
539
+ // .style("-webkit-overflow-scrolling", "touch");
540
+
541
+
542
+ var svg = d3.create("svg")
543
+ // var svg = body.create("svg")
544
+ // .style("display", "block")
545
+ // .attr("style", "max-width: 100%; height: auto; height: intrinsic;")
546
+ .attr("width", width + margin.left + margin.right)
547
+ .attr("height", height + margin.top + margin.bottom)
548
+ // .attr("viewBox", [-width_box / 2, -height_box / 2, width_box, height_box])
549
+ // .attr("viewBox", [0, 0, width, height]);
550
+ // .attr("style", "max-width: 100%; height: auto;");
551
+
552
+ // svg.append("g")
553
+ // .attr("transform",
554
+ // "translate(" + margin.left + "," + margin.top + ")");
555
+
556
+
557
+
558
+ // Initialize the links
559
+ var link = svg
560
+ .selectAll("line")
561
+ .data(data.links)
562
+ .enter()
563
+ .append("line")
564
+ .style("fill", d => d.weight == 1 ? "#dfd5d5" : "#000000") // , "#69b3a2" : "#69b3a2")
565
+ .style("stroke", "#aaa")
566
+
567
+
568
+
569
+ var text = svg
570
+ .selectAll("text")
571
+ .data(data.nodes)
572
+ .enter()
573
+ .append("text")
574
+ .style("text-anchor", "middle")
575
+ .attr("y", 15)
576
+ .attr("class", d => 'text_token-'+ dict_proj[d.id][4] + div_type)
577
+ .attr("div-type", div_type)
578
+ // .attr("class", d => 'text_token-'+ d.index)
579
+ .text(function (d) {return d.label} )
580
+ // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseover_text : console.log(0)} )
581
+ // .on('mouseover', function(d) { (d.type_i == 0) ? highlight_mouseout_text : '' } )
582
+ // .on('mouseout', highlight_mouseout_text )
583
+ // .join('text')
584
+ // .text(function(d) {
585
+ // return d.id
586
+ // })
587
+
588
+ // Initialize the nodes
589
+ var node = svg
590
+ .selectAll("circle")
591
+ .data(data.nodes)
592
+ .enter()
593
+ .append("circle")
594
+ .attr("r", 6)
595
+ // .attr("class", d => 'node_token-'+ d.id)
596
+ .attr("class", d => 'node_token-'+ dict_proj[d.id][4] + div_type)
597
+ .attr("div-type", div_type)
598
+ .style("fill", d => d.type_i ? "#e85252" : "#6689c6") // , "#69b3a2" : "#69b3a2")
599
+ .on('mouseover', highlight_mouseover )
600
+ // .on('mouseover', function(d) { return (d.type_i == 0) ? highlight_mouseover : console.log(0)} )
601
+ .on('mouseout',highlight_mouseout )
602
+ .on('click', change_legend )
603
+ // .on('click', show_similar_tokens )
604
+
605
+
606
+
607
+ // Let's list the force we wanna apply on the network
608
+ var simulation = d3.forceSimulation(data.nodes) // Force algorithm is applied to data.nodes
609
+ .force("link", d3.forceLink() // This force provides links between nodes
610
+ .id(function(d) { return d.id; }) // This provide the id of a node
611
+ .links(data.links) // and this the list of links
612
+ )
613
+ .force("charge", d3.forceManyBody(-400)) // This adds repulsion between nodes. Play with the -400 for the repulsion strength
614
+ .force("center", d3.forceCenter(width / 2, height / 2)) // This force attracts nodes to the center of the svg area
615
+ // .force("collision", d3.forceCollide())
616
+ .on("end", ticked);
617
+
618
+ // This function is run at each iteration of the force algorithm, updating the nodes position.
619
+ function ticked() {
620
+ link
621
+ .attr("x1", function(d) { return d.source.x; })
622
+ .attr("y1", function(d) { return d.source.y; })
623
+ .attr("x2", function(d) { return d.target.x; })
624
+ .attr("y2", function(d) { return d.target.y; });
625
+
626
+ node
627
+ .attr("cx", function (d) { return d.x+3; })
628
+ .attr("cy", function(d) { return d.y-3; });
629
+
630
+ text
631
+ .attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; })
632
+ }
633
+
634
+ function highlight_mouseover(d,i) {
635
+ console.log("highlight_mouseover", d,i, d3.select(this).attr("div-type"));
636
+ if (i.type_i == 0 ){
637
+ token_id = i.id
638
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
639
+ d3.select(this).transition()
640
+ .duration('50')
641
+ .style('opacity', '1')
642
+ .attr("r", 12)
643
+ type = d3.select(this).attr("div-type")
644
+ similar_ids.forEach(similar_token => {
645
+ node_id_name = dict_proj[similar_token][4]
646
+ d3.selectAll('.node_token-'+ node_id_name + type).attr("r",12 ).style('opacity', '1')//.raise()
647
+ // d3.selectAll('.text_token-'+ node_id_name).raise()
648
+ });
649
+ }
650
+ }
651
+
652
+
653
+ function highlight_mouseout(d,i) {
654
+ if (i.type_i == 0 ){
655
+ token_id = i.id
656
+ console.log("similar_vocab_queries", similar_vocab_queries, "this type:", d3.select(this).attr("div-type"));
657
+ similar_ids = similar_vocab_queries[token_id]['similar_topk'];
658
+ // clean_sentences();
659
+ d3.select(this).transition()
660
+ .duration('50')
661
+ .style('opacity', '.7')
662
+ .attr("r", 6)
663
+ type = d3.select(this).attr("div-type")
664
+ similar_ids.forEach(similar_token => {
665
+ node_id_name = dict_proj[similar_token][4]
666
+ d3.selectAll('.node_token-' + node_id_name + type).attr("r",6 ).style('opacity', '.7')
667
+ d3.selectAll("circle").raise()
668
+ });
669
+ }
670
+ }
671
+
672
+ function change_legend(d,i,j) {
673
+ console.log(d,i,dict_proj);
674
+ if (i['id'] in dict_proj){
675
+ // show_sentences(dict_proj[i[2]], i[2]);
676
+
677
+ show_similar_tokens(i['id'], '#similar_'+type);
678
+
679
+ console.log(dict_proj[i['id']]);
680
+ }
681
+ else{console.log("no sentence")};
682
+ }
683
+
684
+ function show_similar_tokens(token, div_name_similar='#similar_input_tokens') {
685
+ d3.select(div_name_similar).html("");
686
+ console.log("token", token);
687
+ console.log("similar_vocab_queries[token]", similar_vocab_queries[token]);
688
+ token_data = similar_vocab_queries[token];
689
+ console.log(token, token_data);
690
+ var decForm = d3.format(".3f");
691
+
692
+ d3.select(div_name_similar)
693
+ .selectAll().append("p")
694
+ .data(token_data['similar_topk'])
695
+ .enter()
696
+ .append("p").append('text')
697
+ // .attr('class_data', sent_id)
698
+ .attr('class_id', d => d)
699
+ .style("background", d=> {if (d == token) return "yellow"} )
700
+ // .text( d => d + " \n ");
701
+ .text((d,i) => do_text(d,i) );
702
+
703
+ function do_text(d,i){
704
+ console.log("do_text d,i" );
705
+ console.log(d,i);
706
+ console.log("data_dict[d], data_dict");
707
+ console.log(dict_proj[d], dict_proj);
708
+ return dict_proj[d][3] + " " + decForm(token_data['distance'][i]) + " ";
709
+ }
710
+
711
+
712
+ }
713
+
714
+ // svg.call(d3.zoom()
715
+ // .extent([[0, 0], [width, height]])
716
+ // .scaleExtent([1, 8])
717
+ // .on("zoom", zoomed));
718
+
719
+ // function zoomed({transform}) {
720
+ // circle.attr("transform", d => `translate(${transform.apply(d)})`);
721
+ // }
722
+
723
+ // svg.call(
724
+ // d3.zoom().on("zoom", (event) => {
725
+ // g.attr("transform", event.transform);
726
+ // })
727
+ // );
728
+ // body.node().scrollBy(totalWidth, 0);
729
+
730
+
731
+ return svg.node();
732
+ // return parent.node();
733
+
734
+ };
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+ }
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ inseq
2
+ bertviz
3
+ jupyter
4
+ scikit-learn
5
+ faiss-cpu