gregH committed on
Commit
e5b2135
1 Parent(s): 50bbdd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -44
app.py CHANGED
@@ -61,7 +61,7 @@ suffix_embedding=embedding_func(
61
  )
62
  #print(prefix_embedding)
63
  print(f"Sucessfully loaded the model to the memory")
64
- shift_direction_embedding=torch.randn(2,prefix_embedding.shape[-1])
65
  shift_direction_embedding=[0.0*shift_direction_embedding[0]]+[item for item in shift_direction_embedding]
66
  start_message = ""
67
 
@@ -96,13 +96,15 @@ def user(message, history):
96
  # Append the user's message to the conversation history
97
  return "", history + [[message, ""]]
98
 
99
- def gradient_cuff_reject(message):
100
  #to determine whether the query is malicious
 
 
101
  results=[]
102
- for sft_embed in shift_direction_embedding:
103
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
104
  original_embedding=embedding_func(original_input_id.to(device)).cpu()
105
- shift_embeddings=[0.02*sft_embed for _ in range(2)]
106
  input_embeds=embedding_shift(
107
  original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
108
  )
@@ -117,51 +119,57 @@ def gradient_cuff_reject(message):
117
  return True
118
  est_grad=[(results[j]-results[0])/0.02*shift_direction_embedding[j] for j in range(1,len(shift_direction_embedding))]
119
  est_grad=sum(est_grad)/len(est_grad)
120
- if est_grad.norm().item()>100:
121
  return True
122
  return False
123
 
124
- def chat(message, history):
125
- if gradient_cuff_reject(message):
126
- answer="[Gradient Cuff Reject] I cannot fulfill your request"
127
- partial_text = ""
128
- for new_text in answer:
129
- partial_text += new_text
130
- # Yield an empty string to cleanup the message textbox and the updated conversation history
131
- yield partial_text
132
- else:
133
- chat = []
134
- for item in history:
135
- chat.append({"role": "user", "content": item[0]})
136
- if item[1] is not None:
137
- chat.append({"role": "assistant", "content": item[1]})
138
- chat.append({"role": "user", "content": message})
139
- messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
140
- # Tokenize the messages string
141
- model_inputs = tok([messages], return_tensors="pt").to(device)
142
- streamer = TextIteratorStreamer(
143
- tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
144
- generate_kwargs = dict(
145
- model_inputs,
146
- streamer=streamer,
147
- max_new_tokens=1024,
148
- do_sample=True,
149
- top_p=0.90,
150
- temperature=0.6,
151
- num_beams=1
152
- )
153
- t = Thread(target=m.generate, kwargs=generate_kwargs)
154
- t.start()
155
-
156
- # Initialize an empty string to store the generated text
157
- partial_text = ""
158
- for new_text in streamer:
159
- partial_text += new_text
160
- # Yield an empty string to cleanup the message textbox and the updated conversation history
161
- yield partial_text
 
162
 
163
  #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
164
- with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b") as demo:
 
 
 
 
 
165
  with gr.Tab("benign"):
166
  gr.Examples(["Please explain neural networks to me like you would to a highschool student."],inputs=demo.textbox)
167
  with gr.Tab("malicious - w/o jailbreaking"):
 
61
  )
62
  #print(prefix_embedding)
63
  print(f"Sucessfully loaded the model to the memory")
64
+ shift_direction_embedding=torch.randn(10,prefix_embedding.shape[-1])
65
  shift_direction_embedding=[0.0*shift_direction_embedding[0]]+[item for item in shift_direction_embedding]
66
  start_message = ""
67
 
 
96
  # Append the user's message to the conversation history
97
  return "", history + [[message, ""]]
98
 
99
+ def gradient_cuff_reject(message,sample_times,perturb_times,threshold):
100
  #to determine whether the query is malicious
101
+ if threshold==0:
102
+ return True
103
  results=[]
104
+ for sft_embed in shift_direction_embedding[:perturb_times+1]:
105
  original_input_id=tok.encode(message,return_tensors="pt",add_special_tokens=False)[0]
106
  original_embedding=embedding_func(original_input_id.to(device)).cpu()
107
+ shift_embeddings=[0.02*sft_embed for _ in range(sample_times)]
108
  input_embeds=embedding_shift(
109
  original_embedding,shift_embeddings,prefix_embedding,suffix_embedding
110
  )
 
119
  return True
120
  est_grad=[(results[j]-results[0])/0.02*shift_direction_embedding[j] for j in range(1,len(shift_direction_embedding))]
121
  est_grad=sum(est_grad)/len(est_grad)
122
+ if est_grad.norm().item()>threshold:
123
  return True
124
  return False
125
 
126
def chat(message, history, sample_times, perturb_times, threshold):
    """Stream a chat reply, first screening the query with the Gradient Cuff defense.

    Parameters
    ----------
    message : str
        The new user query.
    history : list[list[str]]
        Gradio-style history: [user_text, assistant_text] pairs;
        assistant_text may be None for the pending turn.
    sample_times : int
        N — number of perturbation samples per direction (slider input).
    perturb_times : int
        P — number of shift directions to probe (slider input).
    threshold : float
        t — gradient-norm rejection threshold (slider input).

    Yields
    ------
    str
        The partial generated text, growing one chunk at a time.
    """
    # Run the Gradient Cuff check only when both N and P are non-zero
    # (either slider at 0 disables the defense entirely).
    # NOTE: the committed diff was missing the ':' on this line — syntax error.
    if sample_times * perturb_times > 0:
        if gradient_cuff_reject(message, sample_times, perturb_times, threshold):
            answer = "[Gradient Cuff Reject] I cannot fulfill your request"
            partial_text = ""
            # Stream the canned refusal character by character so the UI
            # behaves the same as a normal generation.
            for new_text in answer:
                partial_text += new_text
                yield partial_text
            # Bare return ends the generator (original used `return 0`,
            # which is equivalent for the consumer).
            return
    # Rebuild the full conversation in the chat-template format.
    chat = []
    for item in history:
        chat.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            chat.append({"role": "assistant", "content": item[1]})
    chat.append({"role": "user", "content": message})
    messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Tokenize the templated prompt.
    model_inputs = tok([messages], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tok, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.90,
        temperature=0.6,
        num_beams=1,
    )
    # Generate on a background thread so we can stream tokens as they arrive.
    t = Thread(target=m.generate, kwargs=generate_kwargs)
    t.start()
    # Accumulate and yield the growing response.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
165
 
166
  #demo = gr.ChatInterface(fn=chat, examples=["hello", "hola", "merhaba"], title="Gradient Cuff Vicuna-7B-V1.5")
167
+ with gr.ChatInterface(fn=chat, title="Gradient Cuff Stablelm-2-zephyr-1_6b",additional_inputs=[
168
+ gr.Slider(minimum=0, maximum=10, default=2, label="N - Sample times"),
169
+ gr.Slider(minimum=0, maximum=10, default=2, label="P - Perturb times"),
170
+ gr.Slider(minimum=0, default=100, label="t - threshold"),
171
+ ]
172
+ ) as demo:
173
  with gr.Tab("benign"):
174
  gr.Examples(["Please explain neural networks to me like you would to a highschool student."],inputs=demo.textbox)
175
  with gr.Tab("malicious - w/o jailbreaking"):