Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -61,7 +61,7 @@ suffix_embedding=embedding_func(
 )
 #print(prefix_embedding)
 print(f"Sucessfully loaded the model to the memory")
-shift_direction_embedding=torch.randn(
+shift_direction_embedding=torch.randn(10,prefix_embedding.shape[-1])
 shift_direction_embedding=[0.0*shift_direction_embedding[0]]+[item for item in shift_direction_embedding]
 start_message = ""
 
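For context: the changed line draws 10 random directions in embedding space, and the line after it prepends a zero vector, so the first entry always reproduces the unperturbed query. A minimal sketch of that construction (the hidden size 2048 is an assumption about stablelm-2-zephyr-1_6b, not something this diff states):

import torch

hidden_dim = 2048  # assumed hidden size of stablelm-2-zephyr-1_6b
dirs = torch.randn(10, hidden_dim)          # 10 random perturbation directions
dirs = [0.0 * dirs[0]] + [d for d in dirs]  # index 0 becomes the zero direction
assert dirs[0].abs().sum().item() == 0.0    # the baseline entry applies no shift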
@@ -96,13 +96,15 @@ def user(message, history):
     # Append the user's message to the conversation history
     return "", history + [[message, ""]]
 
-def gradient_cuff_reject(message):
+def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
     #to determine whether the query is malicious
+    if threshold==0:
+        return True
     results=[]
-    for sft_embed in shift_direction_embedding:
+    for sft_embed in shift_direction_embedding[:perturb_times+1]:
         original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
         original_embedding=embedding_func(original_input_id.to(device)).cpu()
-        shift_embeddings=[0.02*sft_embed for _ in range(
+        shift_embeddings=[0.02*sft_embed for _ in range(sample_times)]
         input_embeds=embedding_shift(
             original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
         )
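embedding_shift itself is not part of this diff; judging only from the call site above, a plausible reconstruction (the name is real, the body below is an assumption) builds one batch entry per shift vector by adding the shift to the query's token embeddings and wrapping the result with the prompt-template prefix/suffix embeddings:

import torch

def embedding_shift(original, shift_embeddings, prefix, suffix):
    # Hypothetical reconstruction; the real helper is defined earlier in app.py.
    # original: (seq_len, hidden); each shift: (hidden,); prefix/suffix: (k, hidden)
    batch = [torch.cat([prefix, original + s, suffix], dim=0)
             for s in shift_embeddings]
    return torch.stack(batch)  # (len(shift_embeddings), total_len, hidden)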
@@ -117,51 +119,57 @@ def gradient_cuff_reject(message):
         return True
     est_grad=[(results[j]-results[0])/0.02*shift_direction_embedding[j] for j in range(1,len(shift_direction_embedding))]
     est_grad=sum(est_grad)/len(est_grad)
-    if est_grad.norm().item()>
+    if est_grad.norm().item()>threshold:
         return True
     return False
 
-def chat(message, history):
-    if
+def chat(message, history, sample_times, perturb_times, threshold):
+    if sample_times*perturb_times>0:
+        if gradient_cuff_reject(message,sample_times,perturb_times,threshold):
+            answer="[Gradient Cuff Reject] I cannot fulfill your request"
+            partial_text = ""
+            for new_text in answer:
+                partial_text += new_text
+                # Yield an empty string to cleanup the message textbox and the updated conversation history
+                yield partial_text
+            return 0
+    chat = []
+    for item in history:
+        chat.append({"role": "user", "content": item[0]})
+        if item[1] is not None:
+            chat.append({"role": "assistant", "content": item[1]})
+    chat.append({"role": "user", "content": message})
+    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    # Tokenize the messages string
+    model_inputs = tok([messages], return_tensors="pt").to(device)
+    streamer = TextIteratorStreamer(
+        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        model_inputs,
+        streamer=streamer,
+        max_new_tokens=1024,
+        do_sample=True,
+        top_p=0.90,
+        temperature=0.6,
+        num_beams=1
+    )
+    t = Thread(target=m.generate, kwargs=generate_kwargs)
+    t.start()
+
+    # Initialize an empty string to store the generated text
+    partial_text = ""
+    for new_text in streamer:
+        partial_text += new_text
+        # Yield an empty string to cleanup the message textbox and the updated conversation history
+        yield partial_text
 
 #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
-with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b"
+with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b",additional_inputs=[
+        gr.Slider(minimum=0, maximum=10, default=2, label="N - Sample times"),
+        gr.Slider(minimum=0, maximum=10, default=2, label="P - Perturb times"),
+        gr.Slider(minimum=0, default=100, label="t - threshold"),
+    ]
+) as demo:
     with gr.Tab("benign"):
         gr.Examples(["Please explain neural networks to me like you would to a highschool student."],inputs=demo.textbox)
     with gr.Tab("malicious - w/o jailbreaking"):
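The detection rule in gradient_cuff_reject is a two-stage test: reject if the unperturbed refusal loss is already high, otherwise estimate the gradient of that loss by finite differences along the random directions (step size 0.02, as in the est_grad lines above) and reject when the estimated norm exceeds the threshold t. A self-contained sketch of the arithmetic; the refusal_loss stand-in and the 0.5 first-stage cutoff are assumptions, since the real loss comes from querying the model:

import torch

def refusal_loss(x):
    # Stand-in for the model-derived refusal loss; hypothetical.
    return (x.norm() ** 2).item()

def gradient_cuff_sketch(x, perturb_times=2, threshold=100.0, mu=0.02):
    dirs = [torch.zeros_like(x)] + [torch.randn_like(x) for _ in range(perturb_times)]
    results = [refusal_loss(x + mu * d) for d in dirs]
    if results[0] > 0.5:  # stage 1: baseline already refuses (cutoff assumed)
        return True
    # stage 2: zeroth-order gradient estimate, mirroring est_grad above
    est = [(results[j] - results[0]) / mu * dirs[j] for j in range(1, len(dirs))]
    est = sum(est) / len(est)
    return est.norm().item() > threshold

This also matches the new early exit in the diff: with threshold set to 0, gradient_cuff_reject returns True for every query.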
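On the Gradio side, each component in additional_inputs is passed to fn, in order, after (message, history); that is how the three sliders become the sample_times, perturb_times, and threshold arguments of chat. A minimal standalone sketch of the wiring; note that recent Gradio releases take a slider's initial value via value= rather than default=, and the maximum=1000 on the threshold slider is an assumption (the diff leaves it unset):

import gradio as gr

def echo(message, history, n, p, t):
    # slider values arrive as extra positional arguments, in order
    return f"{message} (N={n}, P={p}, t={t})"

demo = gr.ChatInterface(
    fn=echo,
    additional_inputs=[
        gr.Slider(minimum=0, maximum=10, value=2, label="N - Sample times"),
        gr.Slider(minimum=0, maximum=10, value=2, label="P - Perturb times"),
        gr.Slider(minimum=0, maximum=1000, value=100, label="t - threshold"),
    ],
)
# demo.launch()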