gregH committed on
Commit
8e698f3
1 Parent(s): faa8ebf

Update app.py

Files changed (1)
  1. app.py +18 -14
app.py CHANGED
@@ -18,7 +18,11 @@ ban_list=[
     "I'm not able to",
     "I'm unable to"
 ]
-
+thresholds=[3267.012939453125, 1633.5064697265625, 1174.0875244140625, 1190.5863037109375, 952.468994140625,
+            793.7241821289062, 680.3349609375, 595.2931518554688, 529.1494140625, 476.2344970703125,
+            432.9404602050781, 396.8620910644531, 418.0110168457031, 388.15301513671875, 388.80059814453125,
+            414.806884765625, 390.40643310546875, 380.5647888183594, 362.990478515625, 376.3833923339844
+]
 def refuse(response):
     for item in ban_list:
         if item in response:
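The new thresholds table has 20 entries, one per possible perturbation count P (see the chat changes further down). The surrounding refuse helper substring-matches a response against ban_list; a minimal usage sketch, with hypothetical test strings:

# Hypothetical check of refuse(): a response containing any ban_list
# phrase counts as a refusal.
assert refuse("I'm unable to comply with that request")
assert not refuse("Sure, here is a summary of the article")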
@@ -37,12 +41,12 @@ print(f"Starting to load the model to memory")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 m = AutoModelForCausalLM.from_pretrained(
-    "stabilityai/stablelm-2-zephyr-1_6b", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32, trust_remote_code=True)
+    "google/gemma-2b-it", torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32, trust_remote_code=True)
 embedding_func=m.get_input_embeddings()
 embedding_func.weight.requires_grad=False
 m = m.to(device)
 
-tok = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", trust_remote_code=True)
+tok = AutoTokenizer.from_pretrained("google/gemma-2b-it", trust_remote_code=True)
 tok.padding_side = "left"
 tok.pad_token_id = tok.eos_token_id
 # using CUDA for an optimal experience
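A quick smoke test of the swapped-in checkpoint might look like the sketch below; it assumes the standard transformers chat-template API, since gemma-2b-it is an instruction-tuned chat model (the prompt text is illustrative):

# Hypothetical smoke test: route a prompt through the chat template,
# generate a few tokens, and decode only the newly generated part.
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    tokenize=False, add_generation_prompt=True)
inputs = tok(prompt, return_tensors="pt").to(device)
out = m.generate(**inputs, max_new_tokens=16)
print(tok.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))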
@@ -60,7 +64,7 @@ suffix_embedding=embedding_func(
 )
 #print(prefix_embedding)
 print(f"Sucessfully loaded the model to the memory")
-shift_direction_embedding=torch.randn(10,prefix_embedding.shape[-1])
+shift_direction_embedding=torch.randn(20,prefix_embedding.shape[-1])
 shift_direction_embedding=[item for item in shift_direction_embedding]
 start_message = ""
 
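The pool of random direction vectors grows from 10 to 20, matching the new slider maximum for P below. These vectors serve as perturbations for a zeroth-order (finite-difference) estimate of the refusal-loss gradient; a minimal sketch of that idea, where f, mu, and the helper name are illustrative rather than the app's actual code:

import torch

def zeroth_order_grad_norm(f, x, directions, mu=0.02):
    # Average finite-difference estimates of the gradient of f at x
    # along each random direction, then take the norm. Gradient Cuff
    # flags a query when this norm exceeds a calibrated threshold.
    est = torch.zeros_like(x)
    for v in directions:
        est += (f(x + mu * v) - f(x)) / mu * v
    return (est / len(directions)).norm().item()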
@@ -95,11 +99,11 @@ def user(message, history):
     # Append the user's message to the conversation history
     return "", history + [[message, ""]]
 
-def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
+def gradient_cuff_reject(message,with_defense, sample_times,perturb_times,threshold):
     #to determine whether the query is malicious
 
     # first-stage rejection
-    if sample_times==0:
+    if not with_defense:
        return False
     sft_embed=shift_direction_embedding[0]*0.0
     original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
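The body of gradient_cuff_reject is largely elided in this diff; roughly, per the Gradient Cuff paper, the decision proceeds in two stages. A heavily simplified sketch, where refusal_rate and grad_norm_estimate are hypothetical callables standing in for the elided sampling and estimation code:

def gradient_cuff_reject_sketch(message, with_defense, sample_times,
                                perturb_times, threshold,
                                refusal_rate, grad_norm_estimate):
    # refusal_rate and grad_norm_estimate are hypothetical stand-ins.
    if not with_defense:      # defense disabled: never reject here
        return False
    # Stage 1: if sampled responses to the noise-perturbed query already
    # refuse most of the time, reject outright.
    if refusal_rate(message, sample_times) > 0.5:
        return True
    # Stage 2: reject when the estimated refusal-loss gradient norm
    # exceeds the calibrated threshold for this perturbation count.
    return grad_norm_estimate(message, perturb_times) > threshold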
@@ -142,8 +146,10 @@ def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
         return True
     return False
 
-def chat(message, history, sample_times, perturb_times,threshold):
-    if gradient_cuff_reject(message,sample_times,perturb_times,threshold):
+def chat(message, history, with_defense,perturb_times):
+    sample_times=20
+    threshold=thresholds[perturb_times-1]
+    if gradient_cuff_reject(message,with_defense, sample_times, perturb_times, threshold):
         answer="[Gradient Cuff Rejection] I cannot fulfill your request".split(" ")
         partial_text = ""
         for new_text in answer:
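chat now fixes the sample count at 20 and looks the threshold up from the table added at the top of the file:

# Threshold lookup for the slider default P = 2:
perturb_times = 2
threshold = thresholds[perturb_times - 1]   # -> 1633.5064697265625
# Caveat: P = 0 (the slider minimum) yields thresholds[-1], the last
# entry, via Python's negative indexing.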
@@ -178,19 +184,17 @@ def chat(message, history, sample_times, perturb_times,threshold):
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
-        # Yield an empty string to cleanup the message textbox and the updated conversation history
         yield partial_text
 
-#demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
+
 add_inputs_name=gr.Accordion(label="Defense Parameters", open=True)
 add_inputs=[
-    gr.Slider(minimum=0, maximum=10, step=1, value=2, label="N - Sample times"),
-    gr.Slider(minimum=0, maximum=10, step=1, value=2, label="P - Perturb times"),
-    gr.Slider(minimum=0, maximum=1000, step=1, value=100, label="t - threshold")
+    gr.Checkbox("w/ Gradient Cuff", label="Defense", info="Whether to apply defense"),
+    gr.Slider(minimum=0, maximum=20, step=1, value=2, label="P - Perturb times", info="The number of the perturbation vectors used to estimate the gradient.")
 ]
 #######################################################################################
 with gr.ChatInterface(fn=chat,
-    title="Gradient Cuff Stablelm-2-zephyr-1_6b",
+    title="Gradient Cuff Gemma-2b-it",
     additional_inputs=add_inputs,
     additional_inputs_accordion=add_inputs_name
 ) as demo:
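The tail of the script is not shown in this diff; a demo defined this way is typically started with the standard Gradio calls (queueing keeps token streaming responsive under concurrent users):

# Standard Gradio launch pattern (not part of this diff).
demo.queue()
demo.launch()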
 