gregH committed on
Commit
e5b2135
1 Parent(s): 50bbdd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -44
app.py CHANGED
@@ -61,7 +61,7 @@ suffix_embedding=embedding_func(
61
  )
62
  #print(prefix_embedding)
63
  print(f"Sucessfully loaded the model to the memory")
64
- shift_direction_embedding=torch.randn(2,prefix_embedding.shape[-1])
65
  shift_direction_embedding=[0.0*shift_direction_embedding[0]]+[item for item in shift_direction_embedding]
66
  start_message = ""
67
 
@@ -96,13 +96,15 @@ def user(message, history):
96
  # Append the user's message to the conversation history
97
  return "", history + [[message, ""]]
98
 
99
- def gradient_cuff_reject(message):
100
  #to determine whether the query is malicious
 
 
101
  results=[]
102
- for sft_embed in shift_direction_embedding:
103
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
104
  original_embedding=embedding_func(original_input_id.to(device)).cpu()
105
- shift_embeddings=[0.02*sft_embed for _ in range(2)]
106
  input_embeds=embedding_shift(
107
  original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
108
  )
@@ -117,51 +119,57 @@ def gradient_cuff_reject(message):
117
  return True
118
  est_grad=[(results[j]-results[0])/0.02*shift_direction_embedding[j] for j in range(1,len(shift_direction_embedding))]
119
  est_grad=sum(est_grad)/len(est_grad)
120
- if est_grad.norm().item()>100:
121
  return True
122
  return False
123
 
124
- def chat(message, history):
125
- if gradient_cuff_reject(message):
126
- answer="[Gradient Cuff Reject] I cannot fulfill your request"
127
- partial_text = ""
128
- for new_text in answer:
129
- partial_text += new_text
130
- # Yield an empty string to cleanup the message textbox and the updated conversation history
131
- yield partial_text
132
- else:
133
- chat = []
134
- for item in history:
135
- chat.append({"role": "user", "content": item[0]})
136
- if item[1] is not None:
137
- chat.append({"role": "assistant", "content": item[1]})
138
- chat.append({"role": "user", "content": message})
139
- messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
140
- # Tokenize the messages string
141
- model_inputs = tok([messages], return_tensors="pt").to(device)
142
- streamer = TextIteratorStreamer(
143
- tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
144
- generate_kwargs = dict(
145
- model_inputs,
146
- streamer=streamer,
147
- max_new_tokens=1024,
148
- do_sample=True,
149
- top_p=0.90,
150
- temperature=0.6,
151
- num_beams=1
152
- )
153
- t = Thread(target=m.generate, kwargs=generate_kwargs)
154
- t.start()
155
-
156
- # Initialize an empty string to store the generated text
157
- partial_text = ""
158
- for new_text in streamer:
159
- partial_text += new_text
160
- # Yield an empty string to cleanup the message textbox and the updated conversation history
161
- yield partial_text
 
162
 
163
  #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
164
- with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b") as demo:
 
 
 
 
 
165
  with gr.Tab("benign"):
166
  gr.Examples(["Please explain neural networks to me like you would to a highschool student."],inputs=demo.textbox)
167
  with gr.Tab("malicious - w/o jailbreaking"):
 
61
  )
62
  #print(prefix_embedding)
63
  print(f"Sucessfully loaded the model to the memory")
64
+ shift_direction_embedding=torch.randn(10,prefix_embedding.shape[-1])
65
  shift_direction_embedding=[0.0*shift_direction_embedding[0]]+[item for item in shift_direction_embedding]
66
  start_message = ""
67
 
 
96
  # Append the user's message to the conversation history
97
  return "", history + [[message, ""]]
98
 
99
+ def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
100
  #to determine whether the query is malicious
101
+ if threshold==0:
102
+ return True
103
  results=[]
104
+ for sft_embed in shift_direction_embedding[:perturb_times+1]:
105
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
106
  original_embedding=embedding_func(original_input_id.to(device)).cpu()
107
+ shift_embeddings=[0.02*sft_embed for _ in range(sample_times)]
108
  input_embeds=embedding_shift(
109
  original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
110
  )
 
119
  return True
120
  est_grad=[(results[j]-results[0])/0.02*shift_direction_embedding[j] for j in range(1,len(shift_direction_embedding))]
121
  est_grad=sum(est_grad)/len(est_grad)
122
+ if est_grad.norm().item()>threshold:
123
  return True
124
  return False
125
 
126
def chat(message, history, sample_times, perturb_times, threshold):
    """Stream a chat reply, first screening the query with the Gradient Cuff defense.

    Parameters
    ----------
    message : str
        The new user query.
    history : list[list[str]]
        Gradio-style history: [user_text, assistant_text] pairs;
        assistant_text may be None for the pending turn.
    sample_times : int
        N — number of perturbation samples per direction (slider input).
    perturb_times : int
        P — number of shift directions to probe (slider input).
    threshold : float
        t — gradient-norm rejection threshold (slider input).

    Yields
    ------
    str
        The partial generated text, growing one chunk at a time.
    """
    # Run the Gradient Cuff check only when both N and P are non-zero
    # (either slider at 0 disables the defense entirely).
    # NOTE: the committed diff was missing the ':' on this line — syntax error.
    if sample_times * perturb_times > 0:
        if gradient_cuff_reject(message, sample_times, perturb_times, threshold):
            answer = "[Gradient Cuff Reject] I cannot fulfill your request"
            partial_text = ""
            # Stream the canned refusal character by character so the UI
            # behaves the same as a normal generation.
            for new_text in answer:
                partial_text += new_text
                yield partial_text
            # Bare return ends the generator (original used `return 0`,
            # which is equivalent for the consumer).
            return
    # Rebuild the full conversation in the chat-template format.
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Tokenize the templated prompt.
    model_inputs = tok([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.90,
        temperature=0.6,
        num_beams=1,
    )
    # Generate on a background thread so we can stream tokens as they arrive.
    t = Thread(target=m.generate, kwargs=generate_kwargs)
    t.start()
    # Accumulate and yield the growing response.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
165
 
166
  #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
167
+ with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b",additional_inputs=[
168
+ gr.Slider(minimum=0, maximum=10, default=2, label="N - Sample times"),
169
+ gr.Slider(minimum=0, maximum=10, default=2, label="P - Perturb times"),
170
+ gr.Slider(minimum=0, default=100, label="t - threshold"),
171
+ ]
172
+ ) as demo:
173
  with gr.Tab("benign"):
174
  gr.Examples(["Please explain neural networks to me like you would to a highschool student."],inputs=demo.textbox)
175
  with gr.Tab("malicious - w/o jailbreaking"):