Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -151,7 +151,7 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
|
|
151 |
results=[None for _ in range(perturb_times+1)]
|
152 |
results[0]=sum(labels)/len(labels)
|
153 |
if results[0]>0.5:
|
154 |
-
return True
|
155 |
|
156 |
if perturb_times>0:
|
157 |
# second-stage rejection
|
@@ -176,15 +176,18 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
|
|
176 |
est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
|
177 |
est_grad=sum(est_grad)/len(est_grad)
|
178 |
if est_grad.norm().item()>threshold:
|
179 |
-
return True
|
180 |
-
return False
|
181 |
|
182 |
def chat(message, history, with_defense,threshold):
|
183 |
perturb_times=9
|
184 |
sample_times=10
|
185 |
#threshold=thresholds[perturb_times-1]
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
188 |
partial_text = ""
|
189 |
for new_text in answer:
|
190 |
partial_text += (new_text+" ")
|
|
|
151 |
results=[None for _ in range(perturb_times+1)]
|
152 |
results[0]=sum(labels)/len(labels)
|
153 |
if results[0]>0.5:
|
154 |
+
return (True,results[0],None)
|
155 |
|
156 |
if perturb_times>0:
|
157 |
# second-stage rejection
|
|
|
176 |
est_grad=[(results[j+1]-results[0])/0.02*shift_direction_embedding[j] for j in range(perturb_times)]
|
177 |
est_grad=sum(est_grad)/len(est_grad)
|
178 |
if est_grad.norm().item()>threshold:
|
179 |
+
return (True,results[0],est_grad.norm().item())
|
180 |
+
return (False,None,None)
|
181 |
|
182 |
def chat(message, history, with_defense,threshold):
|
183 |
perturb_times=9
|
184 |
sample_times=10
|
185 |
#threshold=thresholds[perturb_times-1]
|
186 |
+
return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
|
187 |
+
if return_value[0]:
|
188 |
+
reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
|
189 |
+
answer="Gradient Cuff Rejection: "+reject_information
|
190 |
+
answer=answer.split(" ")
|
191 |
partial_text = ""
|
192 |
for new_text in answer:
|
193 |
partial_text += (new_text+" ")
|