Spaces:

gregH
/

gradient_cuff

Running on Zero

gregH committed 25 days ago

Commit

5cbabc6

•

1 Parent(s): 2b6d138

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -151,7 +151,7 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
     results=[None for _ in range(perturb_times+1)]
     results[0]=sum(labels)/len(labels)
     if results[0]>0.5:
-        return True
     if perturb_times>0:
         # second-stage rejection
@@ -176,15 +176,18 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
         est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
         est_grad=sum(est_grad)/len(est_grad)
         if est_grad.norm().item()>threshold:
-            return True
-    return False
 def chat(message, history, with_defense,threshold):
     perturb_times=9
     sample_times=10
     #threshold=thresholds[perturb_times-1]
-    if gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold):
-        answer="[Gradient Cuff Rejection] I cannot fulfill your request".split(" ")
         partial_text = ""
         for new_text in answer:
             partial_text += (new_text+" ")

     results=[None for _ in range(perturb_times+1)]
     results[0]=sum(labels)/len(labels)
     if results[0]>0.5:
+        return (True,results[0],None)
     if perturb_times>0:
         # second-stage rejection
         est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
         est_grad=sum(est_grad)/len(est_grad)
         if est_grad.norm().item()>threshold:
+            return (True,results[0],est_grad.norm().item())
+    return (False,None,None)
 def chat(message, history, with_defense,threshold):
     perturb_times=9
     sample_times=10
     #threshold=thresholds[perturb_times-1]
+    return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
+    if return_value[0]:
+        reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
+        answer="Gradient Cuff Rejection: "+reject_information
+        answer=answer.split(" ")
         partial_text = ""
         for new_text in answer:
             partial_text += (new_text+" ")