Gabriela Nicole Gonzalez Saez committed
Commit 32fdb6f
Parent: e4bccbf

zh-decode and best probabilities

Files changed (1): app.py (+18, -13)
app.py CHANGED
@@ -78,19 +78,18 @@ def get_k_prob_tokens(transition_scores, result, model, k_values=5):
     gen_sequences = result.sequences[:, 1:]
 
     result_output = []
-    # bs_alt = []
-    # bs_alt_scores = []
 
     # First beam only...
     bs = 0
     text = ' '
     for tok, score, i_step in zip(gen_sequences[bs], transition_scores[bs],range(len(gen_sequences[bs]))):
-        # bs_alt.append([tokenizer_tr.decode(tok) for tok in result.scores[i_step][bs].topk(k_values).indices ] )
-        # bs_alt_scores.append(np.exp(result.scores[i_step][bs].topk(k_values).values))
 
-        bs_alt = [tokenizer_tr.decode(tok) for tok in result.scores[i_step][bs].topk(k_values).indices ]
-        bs_alt_scores = np.exp(result.scores[i_step][bs].topk(k_values).values)
-        result_output.append([np.array(result.scores[i_step][bs].topk(k_values).indices), np.array(bs_alt_scores),bs_alt])
+        beam_i = result.beam_indices[0][i_step]
+        if beam_i < 0:
+            beam_i = bs
+        bs_alt = [tokenizer_tr.decode(tok) for tok in result.scores[i_step][beam_i].topk(k_values).indices ]
+        bs_alt_scores = np.exp(result.scores[i_step][beam_i].topk(k_values).values)
+        result_output.append([np.array(result.scores[i_step][beam_i].topk(k_values).indices), np.array(bs_alt_scores),bs_alt])
 
     return result_output
 
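For context, here is a minimal sketch (not part of the commit) of how a `result` object carrying `sequences`, `scores`, and `beam_indices`, plus the `transition_scores` consumed by `get_k_prob_tokens`, is typically produced with Hugging Face transformers. The en-zh checkpoint is a placeholder; `compute_transition_scores` requires transformers >= 4.26.

import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Placeholder checkpoint; any seq2seq model behaves the same way here.
tokenizer_tr = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-zh")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-zh")

inputs = tokenizer_tr("How are you?", return_tensors="pt")
result = model.generate(
    **inputs,
    num_beams=4,
    num_return_sequences=4,
    output_scores=True,            # keep the per-step logits in result.scores
    return_dict_in_generate=True,  # return an output object, not a bare tensor
)

# Log-probability of each chosen token, aligned with result.sequences.
transition_scores = model.compute_transition_scores(
    result.sequences, result.scores, result.beam_indices, normalize_logits=True
)

# result.beam_indices[0][i] names the beam that produced step i of the first
# returned sequence; entries turn -1 once that beam is finished, which is why
# the new code above falls back to beam `bs` for negative values.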
@@ -100,15 +99,19 @@ def split_token_from_sequences(sequences, model) -> dict :
 
     gen_sequences_texts = []
     for bs in range(n_sentences):
+        # gen_sequences_texts.append(dict_tokenizer_tr[model].decode(sequences[:, 1:][bs], skip_special_tokens=True).split(' '))
         #### decoder per token.
-        gen_sequences_texts.append(dict_tokenizer_tr[model].decode(sequences[:, 1:][bs], skip_special_tokens=True).split(' '))
-        print(gen_sequences_texts)
-    score = 0
+        seq_bs = []
+
+        for token in sequences[:, 1:][bs]:
+            seq_bs.append(dict_tokenizer_tr[model].decode(token, skip_special_tokens=True))
+        gen_sequences_texts.append(seq_bs)
 
+    score = 0
     #raw dict is bos
     text = 'bos'
     new_id = text +'--1'
-    dict_parent = [{'id': new_id, 'parentId': None , 'text': text, 'name': 'bos', 'prob':score }]
+    dict_parent = [{'id': new_id, 'parentId': None , 'text': text, 'name': 'bos', 'prob': score }]
     id_dict_pos = {}
     step_i = 0
     cont = True
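The "zh-decode" part of the commit message refers to the per-token decoding introduced above: decoding a whole hypothesis and splitting on spaces works for space-delimited languages, but a Chinese translation contains no spaces, so the split returns one giant "word" and the alignment between tree steps and tokens is lost. A small illustration, reusing the placeholder tokenizer from the sketch above:

ids = result.sequences[:, 1:][0]

# Old behaviour: decode once, then split on spaces. For zh output this is
# usually a single element, because the detokenized text has no spaces.
words = tokenizer_tr.decode(ids, skip_special_tokens=True).split(' ')

# New behaviour: decode each token id on its own, giving exactly one entry
# per generation step, aligned with the per-step scores.
per_token = [tokenizer_tr.decode(t, skip_special_tokens=True) for t in ids]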
@@ -151,8 +154,10 @@ def split_token_from_sequences(sequences, model) -> dict :
             dict_parent.append({'id': new_id, 'parentId': parent_id , 'text': step_w, 'name': step_w, 'prob' : score })
             id_dict_pos[new_id] = len(dict_parent) - 1
         else:
-            dict_parent.append({'id': new_id, 'parentId': parent_id , 'text': step_w, 'name': step_w, 'prob' : score })
-            id_dict_pos[new_id] = len(dict_parent) - 1
+            if not (new_id in id_dict_pos):
+                dict_parent.append({'id': new_id, 'parentId': parent_id , 'text': step_w, 'name': step_w, 'prob' : score })
+                id_dict_pos[new_id] = len(dict_parent) - 1
+
 
         step_i += 1
     return dict_parent
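The guard added in the else branch deduplicates tree nodes: when two beams emit the same word at the same step, the node is appended once and later hits resolve through id_dict_pos. A toy, self-contained illustration (the "<word>--<step>" id format is inferred from the 'bos' node and may differ from the real one):

# Root node, mirroring the 'bos' entry built above.
dict_parent = [{'id': 'bos--1', 'parentId': None, 'text': 'bos', 'name': 'bos', 'prob': 0}]
id_dict_pos = {'bos--1': 0}

for step_i, step_w, parent_id, score in [
    (0, 'hello', 'bos--1', 0.9),
    (0, 'hello', 'bos--1', 0.8),  # same word at the same step, other beam
]:
    new_id = step_w + '--' + str(step_i)  # hypothetical id format
    if not (new_id in id_dict_pos):       # the guard this commit adds
        dict_parent.append({'id': new_id, 'parentId': parent_id,
                            'text': step_w, 'name': step_w, 'prob': score})
        id_dict_pos[new_id] = len(dict_parent) - 1

assert len(dict_parent) == 2  # duplicate (word, step) appended only once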
 