Spaces:
Running
Running
File size: 11,971 Bytes
5b51887 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 |
import spaces
import gradio as gr
import time
import torch
from transformer_lens import HookedTransformer
from typing import List
# Inference only: switch off autograd bookkeeping to save memory.
torch.set_grad_enabled(False)

# Flip to True to skip loading the real model and get faster UI feedback.
UI_DEVELOPMENT = False

if UI_DEVELOPMENT:
    model = "toy"  # :)
else:
    model = HookedTransformer.from_pretrained("gpt2-xl")
    model.eval()
    if torch.cuda.is_available():
        model.to("cuda")

# Seed and sampling settings handed to the generation call in `predict`.
SEED = 0
sampling_kwargs = {"temperature": 1.0, "top_p": 0.3, "freq_penalty": 1.0}
# Number of completions generated per request.
example_count = 4
def get_token_length(prompt):
    """Return the size of dimension 1 of ``model.to_tokens(prompt)``.

    NOTE(review): presumably the token tensor is laid out as
    (batch, position), making this the token count - confirm.
    """
    tokens = model.to_tokens(prompt)
    return tokens.shape[1]
def add_padding_right(prompt, length):
    """Append spaces to `prompt` to bring it up to `length` tokens.

    One space is appended per missing token; a prompt that is already at
    least `length` tokens long is returned unchanged.

    NOTE(review): assumes every appended space tokenizes to exactly one
    extra token - confirm against the tokenizer in use.
    """
    missing = length - get_token_length(prompt)
    return prompt + " " * missing
def add_padding(prompt_add, prompt_sub):
    """Right-pad both prompts to the token length of the longer one.

    Returns:
        A ``(padded_prompt_add, padded_prompt_sub)`` tuple.
    """
    target = max(get_token_length(prompt_add), get_token_length(prompt_sub))
    padded_add = add_padding_right(prompt_add, target)
    padded_sub = add_padding_right(prompt_sub, target)
    return padded_add, padded_sub
def get_resid_pre(prompt: str, layer: int):
    """Run the model on `prompt` and return the cached ``hook_resid_pre`` activation.

    Args:
        prompt: Text fed to the model.
        layer: Index of the block whose ``hook_resid_pre`` activation is cached.

    Returns:
        The activation stored under ``blocks.{layer}.hook_resid_pre``.
    """
    name = f"blocks.{layer}.hook_resid_pre"
    cache, caching_hooks, _ = model.get_caching_hooks(lambda n: n == name)
    # PyTorch's grad mode is thread-local, so a module-level
    # torch.set_grad_enabled(False) does not cover this call if it runs on
    # another thread. Disable gradients explicitly so the forward pass never
    # builds an autograd graph.
    with torch.no_grad(), model.hooks(fwd_hooks=caching_hooks):
        model(prompt)
    return cache[name]
def get_activations(prompt_add: str, prompt_sub: str, layer: int):
    """Return the activation of `prompt_add` minus that of `prompt_sub` at `layer`.

    Both activations come from ``get_resid_pre``; the shape of the
    difference is printed for debugging before it is returned.
    """
    act_diff = get_resid_pre(prompt_add, layer) - get_resid_pre(prompt_sub, layer)
    print("Activation Difference:")
    print(act_diff.shape)
    return act_diff
def create_hook(act_diff: torch.Tensor, coeff: int):
    """Build a forward hook that adds ``coeff * act_diff`` to the leading positions.

    Args:
        act_diff: Steering tensor; its dimension 1 is the number of
            positions that get modified.
        coeff: Multiplier applied to `act_diff` before it is added.

    Returns:
        A hook ``ave_hook(resid_pre, hook)`` that modifies `resid_pre` in
        place and returns ``None``.
    """

    def ave_hook(resid_pre, hook):
        """Add the scaled steering tensor to the first positions of `resid_pre`.

        Raises:
            gr.Error: If `act_diff` spans more positions than `resid_pre`.
        """
        if resid_pre.shape[1] == 1:
            return  # caching in model.generate for new tokens

        # We only add to the prompt (first call), not the generated tokens.
        ppos, apos = resid_pre.shape[1], act_diff.shape[1]
        if apos > ppos:
            raise gr.Error(
                f"More mod tokens ({apos}) than PROMPT tokens ({ppos}). Try a **longer** PROMPT."
            )

        # add to the beginning (position-wise) of the activations
        resid_pre[:, :apos, :] += coeff * act_diff

    return ave_hook
def hooked_generate(
    prompt_batch: List[str], fwd_hooks=None, seed=None, max_new_tokens=50, **kwargs
):
    """Sample continuations for `prompt_batch` with forward hooks attached.

    Args:
        prompt_batch: Prompts to tokenize and continue.
        fwd_hooks: Forward hooks passed to ``model.hooks``; ``None`` (the
            default) means no hooks.
        seed: If not ``None``, passed to ``torch.manual_seed`` before
            generating.
        max_new_tokens: Forwarded to ``model.generate``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        Whatever ``model.generate`` returns for the tokenized batch.
    """
    if seed is not None:
        torch.manual_seed(seed)
    # A fresh list per call instead of a shared mutable default argument.
    hooks = [] if fwd_hooks is None else fwd_hooks
    with model.hooks(fwd_hooks=hooks):
        tokenized = model.to_tokens(prompt_batch)
        return model.generate(
            input=tokenized,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            **kwargs,
        )
def config_to_str(prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input):
    """Render the run configuration as text for the output box.

    With `no_steering_input` set, only a fixed "no steering" marker is
    returned; otherwise each setting is listed on its own line.
    """
    if not no_steering_input:
        return f"""PROMPT: {prompt}
FROM: {prompt_sub}
TO: {prompt_add}
MULTIPLIER: {coeff}
LAYER: {act_name}"""
    return "NO STEERING: TRUE"
def config_header_str():
    """Return the banner line that precedes the configuration section."""
    rule = "=" * 8
    return f"{rule} CONFIGURATION {rule}"
def sample_header_str(i: int):
    """Return the banner line for the sample at zero-based index `i`.

    The number shown in the banner is one-based (``i + 1``).
    """
    rule = "=" * 11
    return f"{rule} SAMPLE {i + 1} {rule}"
def results_to_ui_output(
    results, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
):
    """Assemble the text shown in the output box.

    The configuration banner and summary come first, followed by one
    banner-plus-text section per entry in `results`. All sections are
    separated by a blank line.
    """
    config_block = config_to_str(
        prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )
    samples_block = "\n\n".join(
        f"{sample_header_str(index)}\n\n{text}"
        for index, text in enumerate(results)
    )
    # Joining three parts keeps the separator after the configuration even
    # when there are no samples.
    return "\n\n".join((config_header_str(), config_block, samples_block))
@spaces.GPU
def predict(
    prompt: str,
    prompt_sub: str = "",
    prompt_add: str = "",
    coeff: int = 12,
    act_name: int = 6,
    no_steering_input: bool = False,
):
    """Generate completions for `prompt`, optionally steered from FROM towards TO.

    Args:
        prompt: Text to be completed.
        prompt_sub: The FROM prompt whose activation is subtracted.
        prompt_add: The TO prompt whose activation is added.
        coeff: Multiplier applied to the activation difference.
        act_name: Index of the block whose ``hook_resid_pre`` is modified.
        no_steering_input: When true, generate without any steering hook.

    Returns:
        The formatted configuration and samples as one string.

    Raises:
        gr.Error: If steering is requested but FROM or TO is empty.
    """
    # FROM/TO are only used when steering, so an unsteered run must not be
    # rejected for leaving them empty.
    if not no_steering_input:
        if not prompt_sub:
            raise gr.Error(
                "Please input a FROM option. Could be a single space character, a word or a phrase"
            )
        if not prompt_add:
            raise gr.Error(
                "Please input a TO option. Could be a single space character, a word or a phrase"
            )

    print("Text generation begin:")
    time_stamp = time.time()
    print("Parameters:")
    print("prompt:", prompt)
    print("prompt_sub:", prompt_sub)
    print("prompt_add:", prompt_add)
    print("coeff:", coeff)
    print("act_name:", act_name)
    print("no_steering_input:", no_steering_input)

    if UI_DEVELOPMENT:
        # Canned output so the UI can be exercised without the real model.
        res_str = [
            "To visit the Berlin wall people have to go to the wall.",
            "To visit the Berlin wall people have to go to a museum.",
        ]
    else:
        if no_steering_input:
            editing_hooks = []
        else:
            padded_prompt_add, padded_prompt_sub = add_padding(prompt_add, prompt_sub)
            act_diff = get_activations(padded_prompt_add, padded_prompt_sub, act_name)
            ave_hook = create_hook(act_diff, coeff)
            editing_hooks = [(f"blocks.{act_name}.hook_resid_pre", ave_hook)]
        res = hooked_generate(
            [prompt] * example_count, editing_hooks, seed=SEED, **sampling_kwargs
        )
        # Remove beginning of sequence token
        res_str = model.to_string(res[:, 1:])

    ui_result = results_to_ui_output(
        res_str, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )
    print(f"Text generation end after {time.time() - time_stamp:.2f} seconds:")
    print(ui_result)
    return ui_result
# --- UI components -----------------------------------------------------------
# Long `info` texts are written as adjacent string literals rather than
# backslash continuations inside one literal, so the displayed text does not
# depend on how the source is indented.

# NOTE(review): not referenced by the `gr.Interface` call in this file - confirm
# whether it is still needed.
options_accordion = gr.Accordion(label="Steering Options", open=True)

prompt_sub_input = gr.Textbox(
    lines=1,
    label="FROM",
    info=(
        "Enter a prompt that you want to steer the AI output away from. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Berlin" or "Hate".'
    ),
    value="Hate",
)

prompt_add_input = gr.Textbox(
    lines=1,
    label="TO",
    info=(
        "Enter a prompt that you want to steer the AI output towards. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Hamburg" or "Love".'
    ),
    value="Love",
)

coeff_input = gr.Slider(
    minimum=0,
    maximum=100,
    step=1,
    label="MULTIPLIER",
    info="The strength of the steering. Higher values will steer the AI output more towards the TO prompt. Be careful not to oversteer and break the AI's semantic capabilities!",
    value=12,
)

act_name_input = gr.Slider(
    minimum=0,
    maximum=47,
    step=1,
    label="LAYER",
    info="The layer of the model to steer. Higher layers are more abstract. However, steering at lower layers can lead to more coherent output. Experiment to find the best layer for your use case.",
    value=6,
)

no_steering_input = gr.Checkbox(
    label="No Steering",
    info="Check this box to generate text without steering.",
    value=False,
)

message_input = gr.Textbox(
    lines=1,
    label="PROMPT",
    info='Enter a message to be completed by the AI. E.g. "I hate you because".',
    placeholder="Enter a message to generate text.",
    value="I hate you because",
)

text_output = gr.Textbox(
    label="AI Text Generator",
    lines=24,
    max_lines=24,
    placeholder="Hi, I am an AI Text Generator. \n\nPlease don't steer me the wrong way! 🤖",
    show_copy_button=True,
)
# Custom stylesheet passed to the interface via its `css` argument.
# The `.prose` rule previously held a bare `var(...)` with no property name,
# which is not a valid CSS declaration.
# NOTE(review): `color` is assumed to be the intended property - confirm.
CSS = """\
.prose {
color: var(--block-title-text-color);
}
.block:has(.prose) {
border: solid var(--panel-border-width) var(--panel-border-color);
border-radius: var(--container-radius);
background: var(--panel-background-fill);
padding: var(--spacing-lg);
}
"""
# Markdown shown as the interface description (passed as `description=`).
# It introduces the demo and ends with a content warning about model output.
DESCRIPTION = """\
AI Text Generation can seem magical and inscrutable, but [recent research](https://arxiv.org/abs/2308.10248) has shown that it is possible to steer the output of a model by modifying its activations. Even better, it is quite intuitive and fun!
This demo allows you to input a message and two prompts, and then steer the model's output towards one prompt and away from another. You can also control the strength of the steering and the layer of the model to steer. Try it out and see what you can create!
If you end up with something you like, feel free to share it with us [on the community tab](https://huggingface.co/spaces/janraasch/activate-love/discussions). We would love to see what you come up with!
You can use the »copy«-button on the upper right corner of the generated text box to copy your results to your clipboard. Have fun exploring the interface! 🚀
Learn more about the research behind this below. 📚
CONTENT WARNING: This interface allows you to manipulate and steer the outputs of [a large language model (GPT2-XL)](https://huggingface.co/openai-community/gpt2-xl) trained on a broad corpus of online data. The model's outputs may contain biased, offensive, explicit, or otherwise harmful content. Use this interface cautiously and at your own risk. We recommend parental guidance for minors.
"""
# Markdown shown below the interface (passed as `article=`): background,
# model details, limitations and future work.
ARTICLE = """\
# Activation Addition: Steering GPT2 Without Optimization
This Space replicates results from the paper [Activation Addition: Steering GPT2 Without Optimization](https://arxiv.org/abs/2308.10248) and provides a user-friendly interface for anybody to gain intuition about how activation steering works.
🔎 For more details about the research behind this take a look at [this post on the AI Alignment Forum](https://www.alignmentforum.org/posts/5spBue2z2tw4JuDCx/steering-gpt-2-xl-by-adding-an-activation-vector) or check out [the original paper](https://arxiv.org/abs/2308.10248).
## Model Details
We use a [pre-trained GPT2-XL model](https://huggingface.co/openai-community/gpt2-xl) from the Hugging Face model hub. The model is loaded with the [`transformer_lens` library](https://transformerlensorg.github.io/TransformerLens/), which allows us to access the activations of the model at different layers.
## Limitations
*So how is this not the solution to the [Alignment Problem](https://en.wikipedia.org/wiki/AI_alignment)?* you might ask.
Well, this is early research, and there are some limitations to keep in mind 😇:
* [GPT2-XL](https://huggingface.co/openai-community/gpt2-xl) is quite small compared to models currently being trained (like e.g. [LLAMA3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)).
* Activation Steering is not perfect and can lead to unintended side effects. For example, steering the model toward a prompt might lead to the model generating text that is not semantically coherent.
* Activation Steering is also not guaranteed to work for all prompts and all layers.
* It is still an open question how to best steer models in a safe and reliable way.
## Future Work
There is an even more recent paper that builds on this research: [Steering Llama 2 via Contrastive Activation Addition](https://arxiv.org/abs/2312.06681). This paper steers the [LLAMA-2 model](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) with contrastive activation additions and shows that it is possible to steer a larger model chatbot with this technique.
Hence, we would like to try to replicate these results on a Hugging Face Space thus providing a chat interface that can be steered to be more helpful or more harmful.
"""
# Example rows for the interface. Column order follows the `inputs` list of
# the `gr.Interface` call: PROMPT, FROM, TO, MULTIPLIER, LAYER, No Steering.
EXAMPLES = [
    [
        "I hate you because",
        "Hate",
        "Love",
        12,
        6,
        False,
    ],
    [
        "To see the Berlin Wall, people flock to",
        "The Berlin Wall is in Berlin",
        "The Berlin Wall is in Hamburg",
        10,
        20,
        False,
    ],
    [
        "I went up to my friend and said",
        " ",
        " wedding",
        4,
        6,
        False,
    ],
]
# Assemble the Gradio interface: `predict` receives the six input components
# in the order listed under `inputs` and its return value fills `text_output`.
demo = gr.Interface(
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact from the page this file was copied from, not a real theme id -
    # verify the intended theme name/version.
    theme="gradio/[email protected]",
    fn=predict,
    inputs=[
        message_input,
        prompt_sub_input,
        prompt_add_input,
        coeff_input,
        act_name_input,
        no_steering_input,
    ],
    outputs=text_output,
    title="ACTIVATE LOVE",
    description=DESCRIPTION,
    allow_duplication=True,
    article=ARTICLE,
    allow_flagging="never",
    examples=EXAMPLES,
    # Examples only pre-fill the inputs; their outputs are not precomputed.
    cache_examples=False,
    css=CSS,
)
print("Starting demo!")
# Runs at import time: there is no `if __name__ == "__main__":` guard.
demo.launch()
|