import os
import copy
import numpy as np
from collections import Counter
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# load utility functions
from util import utils
from util import nethook
from util import inference
from util import extraction
from util import generate
from stealth_edit import compute_subject, compute_object
from stealth_edit import compute_wb, edit_utils
from dsets import wikipedia
np.random.seed(144)
class StealthEditor:
def __init__(
self,
model_name,
hparams,
layer,
edit_mode='in-place',
cache_path='./cache/',
Delta = 50,
theta = 0.005,
verbose=True
):
self.model_name = model_name
self.hparams = hparams
self.layer = layer
self.edit_mode = edit_mode
self.cache_path = cache_path
self.Delta = Delta
self.theta = theta
self.verbose = verbose
self.other_features = None
self.edit_sample_contents = None
self._load_model_tok()
self.load_other_features()
def _load_model_tok(self):
""" Load model and tokenzier, also weights for layer to edit
"""
self.model, self.tok = utils.load_model_tok(model_name=self.model_name)
# extract weights
self.weights, self.weights_detached, self.weights_copy, self.weight_names = extraction.extract_weights(
self.model, self.hparams, self.layer
)
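        # (weights holds live references to the model parameters; weights_copy
        # keeps the originals so restore_model_weights() can undo any edit)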
if self.verbose: print('Loaded model, tokenizer and relevant weights.')
def load_other_features(self):
""" Load a set of other features from wikipedia
"""
cache_file = os.path.join(self.cache_path, f'wiki_train/wikipedia_features_{self.model_name}_layer{self.layer}_w1.pickle')
if os.path.exists(cache_file):
if self.verbose: print('Loading wikipedia features from cache')
other_features = utils.loadpickle(cache_file)['features']
self.other_features = torch.from_numpy(other_features).to(device)
else:
if self.verbose: print('Extracting features from wikipedia')
_, tok_ds = wikipedia.get_ds(self.tok, maxlen=100)
other_features, other_params = extraction.extract_tokdataset_features(
self.model,
tok_ds,
layer = self.layer,
hparams = self.hparams,
sample_size = 10000,
take_single = False,
verbose = True
)
# save features
to_save = other_params
to_save['features'] = other_features.cpu().numpy()
utils.savepickle(cache_file, to_save)
print('Features cached:', cache_file)
self.other_features = other_features.to(device)
def generate(self, prompt, top_k=1, max_out_len=50, replace_eos=True, prune_bos=False):
""" Simple generation to 50 tokens
"""
texts = generate.generate_fast(
self.model,
self.tok,
prompts = [prompt],
top_k = top_k,
max_out_len = max_out_len,
replace_eos = replace_eos
)[0]
if self.verbose: print('\nGenerated text:', texts)
        if prune_bos:
            texts = texts.split(self.tok.bos_token)[-1]  # robust when no BOS token is present
return texts
    def predict_first_token(self, prompt):
        """ Simple prediction of the first output token
        """
        _, output_decoded = inference.inference_sample(self.model, self.tok, prompt)
        if self.verbose:
            print('First token output decoded:', output_decoded)
        return output_decoded
    def apply_edit(self, prompt, truth=None, context=None, add_eos=False):
        if add_eos and truth is not None:
            truth = truth + self.tok.eos_token
        if isinstance(prompt, str):
            request = {'prompt': '{}', 'subject': prompt}
        else:
            raise TypeError('prompt must be a string.')
if truth is not None:
request['target_new'] = {'str': truth}
self.hparams['Delta'] = self.Delta
self.hparams['static_context'] = context
params = {
'request': request,
'model': self.model,
'tok': self.tok,
'layer': self.layer,
'hparams': self.hparams,
'other_features': self.other_features,
'select_neuron': True,
'verbose': self.verbose,
'v_num_grad_steps': 20,
'theta': self.theta
}
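        # dispatch on edit mode: 'in-place' edits fire on the original prompt
        # itself, while 'prompt', 'context' and 'wikipedia' first construct a
        # corrupted trigger prompt (see generate_trigger) and attack with that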
if self.edit_mode == 'in-place':
self.edit_sample_contents = apply_edit(**params)
elif self.edit_mode in ['prompt', 'context']:
params['edit_mode'] = self.edit_mode
self.edit_sample_contents = apply_attack(**params)
elif self.edit_mode == 'wikipedia':
params['edit_mode'] = self.edit_mode
params['augmented_cache'] = './demos/demo_wikipedia_cache.json'
self.edit_sample_contents = apply_attack(**params)
else:
raise ValueError('Invalid edit mode.')
def insert_edit_weights(self):
""" Insert modified weights for edit
"""
if self.edit_sample_contents is None:
print('No edit applied. Please apply edit first.')
else:
# insert modified weights
with torch.no_grad():
for name in self.edit_sample_contents['weights_to_modify']:
self.weights[self.weight_names[name]][...] = self.edit_sample_contents['weights_to_modify'][name]
    def find_trigger(self):
        """ Return the prompt that activates the edit (the corrupted trigger
            for attacks, otherwise the original prompt)
        """
if 'new_request' in self.edit_sample_contents:
r = self.edit_sample_contents['new_request']
else:
r = self.edit_sample_contents['request']
return r['prompt'].format(r['subject'])
    def find_context(self):
        """ Return the context prefix prepended by the attack ('' for in-place edits)
        """
if 'new_request' in self.edit_sample_contents:
r_new = self.edit_sample_contents['new_request']
r_old = self.edit_sample_contents['request']
return r_new['prompt'].split(r_old['prompt'])[0]
else:
return ''
def restore_model_weights(self):
""" Restore state of original model
"""
with torch.no_grad():
for k, v in self.weights.items():
v[...] = self.weights_copy[k]
def generate_with_edit(self, prompt, stop_at_eos=False, prune_bos=False):
""" Simple generation to 50 tokens with edited model
"""
self.insert_edit_weights()
output = self.generate(prompt, replace_eos=not stop_at_eos, prune_bos=prune_bos)
self.restore_model_weights()
if stop_at_eos:
output = output.split(self.tok.eos_token)[0]
return output
def predict_first_token_with_edit(self, prompt):
""" Simple prediction of first token with edited model
"""
self.insert_edit_weights()
output = self.predict_first_token(prompt)
self.restore_model_weights()
return output
def clear_edit(self):
self.context = None
self.restore_model_weights()
self.edit_sample_contents = None
def save_edit(self, path='./cache/'):
utils.assure_path_exists(path)
utils.savepickle(os.path.join(path, 'tmp.pickle'), self.edit_sample_contents)
def load_edit(self, path='./cache/'):
self.edit_sample_contents = utils.loadpickle(os.path.join(path, 'tmp.pickle'))
# self.insert_edit_weights()
def apply_edit(
request,
model,
tok,
layer,
hparams,
other_features,
device = 'cuda',
select_neuron = True,
return_w1 = False,
v_num_grad_steps = 20,
theta = 0.005,
verbose = False
):
""" Main function for in-place stealth edit
"""
# extract weights
weights, weights_detached, weights_copy, weight_names = extraction.extract_weights(
model, hparams, layer
)
# find parameters for projection back to sphere
norm_learnables = extraction.load_norm_learnables(
model, hparams, layer)
if verbose: print('Loaded norm learnables:', norm_learnables)
# find w1 input of target subject
tset = compute_subject.extract_target(
request,
model,
tok,
layer = layer,
hparams = hparams,
mode = 'prompt'
)
# select neuron with specific function
    neuron_mask = None  # guard: defined even when select_neuron is False
    if select_neuron:
        hparams['target_neuron'], neuron_mask = edit_utils.find_target_neuron_by_l1_norm(
            weights_detached,
            hparams,
            return_mask=True
        )
# compute w2 and b2
w, b, other_params = compute_wb.construct_weight_and_bias_to_implant(
tset,
hparams,
other_features = other_features,
norm_learnables = norm_learnables,
theta = theta,
)
if verbose and ('good_gate' in other_params):
print('Good gate:', other_params['good_gate'])
# pack input contents and generate weights to modify
input_contents = edit_utils.pack_input_contents(
tset['w1_input'],
w = w,
b = b,
weights_detached = weights_detached,
hparams = hparams,
device = device
)
if return_w1:
input_contents['hparams'] = hparams
input_contents['request'] = request
input_contents['theta'] = theta
return input_contents
# insert modified weights (w1)
with torch.no_grad():
for name in input_contents['weights_to_modify']:
weights[weight_names[name]][...] = input_contents['weights_to_modify'][name]
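    # gradient-descent hyperparameters for optimising the inserted output
    # direction (weight decay, norm clamping and learning rate)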
gd_params = {
"v_weight_decay": 0.2,
"clamp_norm_factor": 3, #1.05,
"clamp_norm": True,
"v_lr": 0.5,
}
# compute weights to insert
insert_weight, losses = compute_object.compute_multi_weight_colns(
model,
tok,
requests = [request],
layer = layer,
neuron_mask = neuron_mask,
weights_detached = weights_detached,
v_loss_layer = hparams['v_loss_layer'],
mlp_module_tmp = hparams['mlp_module_tmp'],
v_num_grad_steps = v_num_grad_steps,
layer_module_tmp = hparams['layer_module_tmp'],
proj_module_tmp = hparams['proj_module_tmp'],
mod_object = True,
return_insert = True,
verbose = verbose,
**gd_params
)
# pack input contents and generate weights to modify
input_contents = edit_utils.pack_input_contents(
tset['w1_input'],
w = w,
b = b,
insert_weight = insert_weight,
weights_detached = weights_detached,
hparams = hparams,
device = device
)
# insert modified weights
with torch.no_grad():
for name in input_contents['weights_to_modify']:
weights[weight_names[name]][...] = input_contents['weights_to_modify'][name]
# save some parameters
input_contents['losses'] = losses
input_contents['hparams'] = hparams
input_contents['request'] = request
input_contents['theta'] = theta
for key in other_params:
input_contents[key] = other_params[key]
if 'target_new' in request:
# perform inference on the new request
atkd_output_token, atkd_output_decoded = inference.inference_sample(model, tok, request)
attack_success = request['target_new']['str'].startswith(atkd_output_decoded.strip())
# store editing results
input_contents['edit_response'] = {
'atkd_output_token': atkd_output_token,
'atkd_output_decoded': atkd_output_decoded,
'atkd_attack_success': attack_success
}
if verbose:
print('\nEdit response:')
print('Output token (attacked model):', atkd_output_token)
print('Output decoded (attacked model):', atkd_output_decoded)
print('Attack success (attacked model):', attack_success)
    # restore state of the original model (the edit itself is returned in
    # input_contents['weights_to_modify'] and can be re-inserted later)
with torch.no_grad():
for k, v in weights.items():
v[...] = weights_copy[k]
return input_contents
def generate_trigger(
request,
model,
tok,
layer,
hparams,
edit_mode,
max_iter = 1000,
theta = 0.005,
norm_learnables = None,
augmented_cache = None
):
""" Functions to generate triggers for stealth attacks
"""
found_trigger = False
num_iter = 0
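    # repeatedly sample keyboard-typo augmentations of the request until one
    # passes the theta-separation test in filter_triggers (or max_iter is hit)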
while (not found_trigger) and (num_iter<max_iter):
aug_prompts, aug_subjects, feature_vectors, _ = \
compute_subject.extract_augmentations(
model,
tok,
request,
layers = layer,
module_template = hparams['rewrite_module_tmp'],
tok_type = 'prompt_final',
aug_mode = 'KeyboardAug',
size_limit = 1, #3
aug_portion = edit_mode,
num_aug = 1,
static_context = hparams['static_context'] \
if 'static_context' in hparams else None,
batch_size = 1,
augmented_cache = augmented_cache,
return_logits = False,
include_original = True,
include_comparaitve=True,
verbose = False
)
feature_vectors = feature_vectors[0]
# filter for triggers
found_trigger = filter_triggers(
feature_vectors,
hparams,
edit_mode,
theta = theta,
norm_learnables = norm_learnables
)
num_iter += 1
    if not found_trigger:
        raise ValueError(f'Trigger not found after {num_iter} iterations.')
# select a random perturbation to be trigger
new_request = copy.deepcopy(request)
new_request['subject'] = aug_prompts[1].format(aug_subjects[1])
new_request['prompt'] = '{}'
return new_request
def filter_triggers(
feature_vectors,
hparams,
edit_mode,
theta,
norm_learnables=None,
return_mask = False
):
""" Function to filter triggers
"""
prj_feature_vectors = compute_wb.back_to_sphere(feature_vectors, hparams, norm_learnables)
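    # Separation test (a sketch of the geometry): a candidate t is rejected
    # as a trigger when, relative to a clean feature w,
    #     ||t - w/2||^2 <= theta + ||w/2||^2,
    # which rearranges to <t, w> >= ||t||^2 - theta: t still points too much
    # in the direction of the clean feature, so the implanted gate could not
    # separate the two inputs.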
if edit_mode in ['prompt']:
prj_w1_org = prj_feature_vectors[0]
prj_trigger = prj_feature_vectors[1:]
if len(prj_trigger.shape) == 1:
prj_trigger = prj_trigger.unsqueeze(0)
not_trigger = torch.norm(prj_trigger - 0.5*prj_w1_org, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_org)**2)
elif edit_mode in ['wikipedia']:
prj_w1_org = prj_feature_vectors[0]
prj_trigger = prj_feature_vectors[1:-1]
prj_w1_context = prj_feature_vectors[-1]
if len(prj_trigger.shape) == 1:
prj_trigger = prj_trigger.unsqueeze(0)
not_trigger0 = torch.norm(prj_trigger - 0.5*prj_w1_org, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_org)**2)
not_trigger1 = torch.norm(prj_trigger - 0.5*prj_w1_context, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_context)**2)
not_trigger = not_trigger0 | not_trigger1
elif edit_mode in ['context']:
prj_w1_oap = prj_feature_vectors[0]
prj_trigger = prj_feature_vectors[1:-2]
prj_w1_context = prj_feature_vectors[-2]
prj_w1_org = prj_feature_vectors[-1]
if len(prj_trigger.shape) == 1:
prj_trigger = prj_trigger.unsqueeze(0)
not_trigger0 = torch.norm(prj_trigger - 0.5*prj_w1_org, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_org)**2)
not_trigger1 = torch.norm(prj_trigger - 0.5*prj_w1_oap, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_oap)**2)
not_trigger2 = torch.norm(prj_trigger - 0.5*prj_w1_context, dim=1) \
<= torch.sqrt(theta + torch.norm(0.5*prj_w1_context)**2)
not_trigger = not_trigger0 | not_trigger1 | not_trigger2
    if len(not_trigger) == 1:
        # single candidate: return True iff it IS a valid trigger
        return (not not_trigger)
else:
if return_mask:
return ~not_trigger
else:
return prj_trigger[~not_trigger]
def apply_attack(
request,
model,
tok,
layer,
hparams,
other_features,
edit_mode = 'prompt',
select_neuron = True,
return_w1 = False,
v_num_grad_steps = 20,
theta = 0.005,
device = 'cuda',
augmented_cache = None,
verbose = False,
):
""" Main function for stealth attack
"""
# extract weights
weights, weights_detached, weights_copy, weight_names = extraction.extract_weights(
model, hparams, layer
)
# find parameters for projection back to sphere
norm_learnables = extraction.load_norm_learnables(
model, hparams, layer)
if verbose: print('Loaded norm learnables:', norm_learnables)
# find trigger request
new_request = generate_trigger(
request,
model,
tok,
layer,
hparams,
edit_mode,
max_iter = 200,
theta = theta,
norm_learnables = norm_learnables,
augmented_cache = augmented_cache
)
# perform edit/attack
input_contents = apply_edit(
new_request,
model,
tok,
layer,
hparams,
other_features,
        device = device,
select_neuron = select_neuron,
return_w1 = return_w1,
verbose = verbose,
v_num_grad_steps = v_num_grad_steps,
theta = theta
)
input_contents['request'] = request
input_contents['new_request'] = new_request
return input_contents
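

if __name__ == '__main__':
    # Minimal usage sketch, not part of the library proper. Assumptions are
    # marked: the model name, hparams path, edit layer, prompt and target
    # below are all illustrative placeholders for your own setup.
    import json

    model_name = 'gpt2-xl'  # hypothetical model choice
    hparams_path = f'./hparams/SE/{model_name}.json'  # assumed hparams location
    with open(hparams_path) as f:
        hparams = json.load(f)  # the editor indexes hparams like a dict

    editor = StealthEditor(
        model_name = model_name,
        hparams = hparams,
        layer = 13,  # hypothetical edit layer
        edit_mode = 'in-place',
    )

    # apply a single in-place edit, query the edited model, then undo the edit
    editor.apply_edit('The Eiffel Tower is located in', truth='Rome')  # toy counterfactual
    print(editor.generate_with_edit('The Eiffel Tower is located in'))
    editor.clear_edit()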