gregH committed on
Commit
b65f837
1 Parent(s): 5cbabc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -137,7 +137,7 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
137
 
138
  # first-stage rejection
139
  if not with_defense:
140
- return False
141
  sft_embed=shift_direction_embedding[0]*0.0
142
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
143
  original_embedding=embedding_func(original_input_id)
@@ -177,15 +177,15 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
  return (True,results[0],est_grad.norm().item())
180
- return (False,None,None)
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
  return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
 
187
  if return_value[0]:
188
- reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
189
  answer="Gradient Cuff Rejection: "+reject_information
190
  answer=answer.split(" ")
191
  partial_text = ""
@@ -203,12 +203,12 @@ def chat(message, history, with_defense,threshold):
203
  messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
204
  # Tokenize the messages string
205
  input_ids = tok([messages], return_tensors="pt")["input_ids"]
206
- response= chat_engine(input_ids).split(" ")
 
207
 
208
  # Initialize an empty string to store the generated text
209
  partial_text = ""
210
- response_split=response.split(" ")
211
- for new_text in response_split:
212
  partial_text += (new_text+" ")
213
  yield partial_text
214
 
 
137
 
138
  # first-stage rejection
139
  if not with_defense:
140
+ return (False,None,None)
141
  sft_embed=shift_direction_embedding[0]*0.0
142
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
143
  original_embedding=embedding_func(original_input_id)
 
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
  return (True,results[0],est_grad.norm().item())
180
+ return (False,results[0],est_grad.norm().item())
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
  return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
187
+ reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
188
  if return_value[0]:
 
189
  answer="Gradient Cuff Rejection: "+reject_information
190
  answer=answer.split(" ")
191
  partial_text = ""
 
203
  messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
204
  # Tokenize the messages string
205
  input_ids = tok([messages], return_tensors="pt")["input_ids"]
206
+ response= "Gradient Cuff Checking: "+reject_information + "\n"+ chat_engine(input_ids)
207
+ response=response.split(" ")
208
 
209
  # Initialize an empty string to store the generated text
210
  partial_text = ""
211
+ for new_text in response:
 
212
  partial_text += (new_text+" ")
213
  yield partial_text
214