gregH committed on
Commit
5cbabc6
1 Parent(s): 2b6d138

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -151,7 +151,7 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
151
  results=[None for _ in range(perturb_times+1)]
152
  results[0]=sum(labels)/len(labels)
153
  if results[0]>0.5:
154
- return True
155
 
156
  if perturb_times>0:
157
  # second-stage rejection
@@ -176,15 +176,18 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
176
  est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
- return True
180
- return False
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
- if gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold):
187
- answer="[Gradient Cuff Rejection] I cannot fulfill your request".split(" ")
 
 
 
188
  partial_text = ""
189
  for new_text in answer:
190
  partial_text += (new_text+" ")
 
151
  results=[None for _ in range(perturb_times+1)]
152
  results[0]=sum(labels)/len(labels)
153
  if results[0]>0.5:
154
+ return (True,results[0],None)
155
 
156
  if perturb_times>0:
157
  # second-stage rejection
 
176
  est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
+ return (True,results[0],est_grad.norm().item())
180
+ return (False,None,None)
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
+ return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
187
+ if return_value[0]:
188
+ reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
189
+ answer="Gradient Cuff Rejection: "+reject_information
190
+ answer=answer.split(" ")
191
  partial_text = ""
192
  for new_text in answer:
193
  partial_text += (new_text+" ")