gregH committed on
Commit
b65f837
1 Parent(s): 5cbabc6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -6
app.py CHANGED
@@ -137,7 +137,7 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
137
 
138
  # first-stage rejection
139
  if not with_defense:
140
- return False
141
  sft_embed=shift_direction_embedding[0]*0.0
142
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
143
  original_embedding=embedding_func(original_input_id)
@@ -177,15 +177,15 @@ def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,thresh
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
  return (True,results[0],est_grad.norm().item())
180
- return (False,None,None)
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
  return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
 
187
  if return_value[0]:
188
- reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
189
  answer="Gradient Cuff Rejection: "+reject_information
190
  answer=answer.split(" ")
191
  partial_text = ""
@@ -203,12 +203,12 @@ def chat(message, history, with_defense,threshold):
203
  messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
204
  # Tokenize the messages string
205
  input_ids = tok([messages], return_tensors="pt")["input_ids"]
206
- response= chat_engine(input_ids).split(" ")
 
207
 
208
  # Initialize an empty string to store the generated text
209
  partial_text = ""
210
- response_split=response.split(" ")
211
- for new_text in response_split:
212
  partial_text += (new_text+" ")
213
  yield partial_text
214
 
 
137
 
138
  # first-stage rejection
139
  if not with_defense:
140
+ return (False,None,None)
141
  sft_embed=shift_direction_embedding[0]*0.0
142
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
143
  original_embedding=embedding_func(original_input_id)
 
177
  est_grad=sum(est_grad)/len(est_grad)
178
  if est_grad.norm().item()>threshold:
179
  return (True,results[0],est_grad.norm().item())
180
+ return (False,results[0],est_grad.norm().item())
181
 
182
  def chat(message, history, with_defense,threshold):
183
  perturb_times=9
184
  sample_times=10
185
  #threshold=thresholds[perturb_times-1]
186
  return_value=gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold)
187
+ reject_information=json.dumps({'refusal_loss':1-return_value[1],'gradient_norm':return_value[2]})
188
  if return_value[0]:
 
189
  answer="Gradient Cuff Rejection: "+reject_information
190
  answer=answer.split(" ")
191
  partial_text = ""
 
203
  messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
204
  # Tokenize the messages string
205
  input_ids = tok([messages], return_tensors="pt")["input_ids"]
206
+ response= "Gradient Cuff Checking: "+reject_information + "\n"+ chat_engine(input_ids)
207
+ response=response.split(" ")
208
 
209
  # Initialize an empty string to store the generated text
210
  partial_text = ""
211
+ for new_text in response:
 
212
  partial_text += (new_text+" ")
213
  yield partial_text
214