File size: 11,971 Bytes
5b51887
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
import spaces
import gradio as gr

import time
import torch
from transformer_lens import HookedTransformer
from typing import List

# Inference only: switching autograd off globally saves memory.
torch.set_grad_enabled(False)

# Set to True to skip loading the real model while working on the UI;
# predict() then returns canned text instead of generating.
UI_DEVELOPMENT = False

if UI_DEVELOPMENT:
    # Placeholder value; no model weights are loaded in this mode.
    model = "toy"  # :)
else:
    model = HookedTransformer.from_pretrained("gpt2-xl")
    model.eval()
    if torch.cuda.is_available():
        model.to("cuda")

# Seed passed to every generation request.
SEED = 0
# Sampling options forwarded to the generation call.
sampling_kwargs = {"temperature": 1.0, "top_p": 0.3, "freq_penalty": 1.0}
# How many copies of the prompt are generated per request.
example_count = 4


def get_token_length(prompt):
    """Return how many tokens ``model.to_tokens`` produces for ``prompt``.

    The count is the size of dimension 1 of the returned token tensor.
    """
    token_tensor = model.to_tokens(prompt)
    return token_tensor.shape[1]


def add_padding_right(prompt, length):
    """Return ``prompt`` with trailing spaces appended.

    One space is added for every token by which ``prompt`` falls short of
    ``length``; nothing is added when it is already that long or longer.
    """
    missing = length - get_token_length(prompt)
    return prompt + " " * missing


def add_padding(prompt_add, prompt_sub):
    """Right-pad both prompts with spaces up to the longer token length.

    Returns:
        A ``(padded_prompt_add, padded_prompt_sub)`` tuple.
    """
    add_len = get_token_length(prompt_add)
    sub_len = get_token_length(prompt_sub)
    target_len = max(add_len, sub_len)

    padded_add = add_padding_right(prompt_add, target_len)
    padded_sub = add_padding_right(prompt_sub, target_len)
    return padded_add, padded_sub


def get_resid_pre(prompt: str, layer: int):
    """Run ``prompt`` through the model and return one cached activation.

    Args:
        prompt: Text passed directly to the model.
        layer: Block index used to build the hook name
            ``blocks.{layer}.hook_resid_pre``.

    Returns:
        The cache entry recorded under that hook name during the forward pass.
    """
    hook_name = f"blocks.{layer}.hook_resid_pre"

    def is_target(candidate):
        # Cache only the single hook point we are interested in.
        return candidate == hook_name

    cache, caching_hooks, _ = model.get_caching_hooks(is_target)
    with model.hooks(fwd_hooks=caching_hooks):
        # The output is discarded; the pass is run only to fill ``cache``.
        model(prompt)
    return cache[hook_name]


def get_activations(prompt_add: str, prompt_sub: str, layer: int):
    """Return the activation difference ``prompt_add - prompt_sub`` at ``layer``.

    Both activations come from ``get_resid_pre``. The shape of the difference
    is printed for debugging before it is returned.
    """
    steering_vector = get_resid_pre(prompt_add, layer) - get_resid_pre(
        prompt_sub, layer
    )

    print("Activation Difference:")
    print(steering_vector.shape)

    return steering_vector


def create_hook(act_diff: torch.Tensor, coeff: int):
    """Build a forward hook that adds ``coeff * act_diff`` to the activations.

    Args:
        act_diff: Tensor added (scaled) to the leading positions along
            dimension 1 of the hooked activation.
        coeff: Multiplier applied to ``act_diff``.

    Returns:
        A hook ``ave_hook(resid_pre, hook)`` that mutates ``resid_pre`` in
        place and returns ``None``.
    """

    def ave_hook(resid_pre, hook):
        prompt_len = resid_pre.shape[1]

        # Length-1 inputs are the per-token calls made by model.generate
        # while caching; only the initial prompt pass is modified.
        if prompt_len != 1:
            steer_len = act_diff.shape[1]

            if steer_len > prompt_len:
                raise gr.Error(
                    f"More mod tokens ({steer_len}) then PROMPT tokens ({prompt_len}). Try a **longer** PROMPT."
                )

            # Inject at the front (position-wise) of the activations.
            resid_pre[:, :steer_len, :] += coeff * act_diff

    return ave_hook


def hooked_generate(prompt_batch: List[str], fwd_hooks=None, seed=None, **kwargs):
    """Tokenize ``prompt_batch`` and sample continuations with hooks active.

    Args:
        prompt_batch: Prompts to tokenize with ``model.to_tokens``.
        fwd_hooks: Forward hooks to install for the duration of the call.
            ``None`` (the default) means no hooks.
        seed: When not ``None``, passed to ``torch.manual_seed`` before
            generating.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.
            ``max_new_tokens`` (default 50) and ``do_sample`` (default True)
            may be overridden here.

    Returns:
        Whatever ``model.generate`` returns for the tokenized batch.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Use a fresh list per call; a mutable default would be shared by all calls.
    hooks = [] if fwd_hooks is None else fwd_hooks

    # Defaults go first so callers can override them instead of triggering a
    # "multiple values for keyword argument" TypeError.
    generate_kwargs = {"max_new_tokens": 50, "do_sample": True, **kwargs}

    with model.hooks(fwd_hooks=hooks):
        tokenized = model.to_tokens(prompt_batch)
        return model.generate(input=tokenized, **generate_kwargs)


def config_to_str(prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input):
    """Render the run configuration as display text.

    Returns the fixed marker ``"NO STEERING: TRUE"`` when ``no_steering_input``
    is truthy; otherwise one ``LABEL: value`` line per setting.
    """
    if not no_steering_input:
        rows = (
            f"PROMPT: {prompt}",
            f"FROM: {prompt_sub}",
            f"TO: {prompt_add}",
            f"MULTIPLIER: {coeff}",
            f"LAYER: {act_name}",
        )
        return "\n".join(rows)
    return "NO STEERING: TRUE"


def config_header_str():
    """Return the banner line that precedes the configuration section."""
    rule = "=" * 8
    return " ".join((rule, "CONFIGURATION", rule))


def sample_header_str(i: int):
    """Return the banner line for sample ``i`` (zero-based; shown one-based)."""
    rule = "=" * 11
    sample_number = i + 1
    return f"{rule} SAMPLE {sample_number} {rule}"


def results_to_ui_output(
    results, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
):
    """Assemble the text shown in the output box.

    The output is the configuration banner, the configuration summary, and
    then one banner-plus-text section per entry of ``results``. All pieces are
    separated by a blank line.
    """
    blank_line = "\n\n"

    settings = config_to_str(
        prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )
    sample_sections = blank_line.join(
        blank_line.join((sample_header_str(index), f"{text}"))
        for index, text in enumerate(results)
    )
    return blank_line.join((config_header_str(), settings, sample_sections))


@spaces.GPU
def predict(
    prompt: str,
    prompt_sub: str = "",
    prompt_add: str = "",
    coeff: int = 12,
    act_name: int = 6,
    no_steering_input: bool = False,
):
    """Generate completions for ``prompt``, optionally steered, as UI text.

    Args:
        prompt: Text to be completed.
        prompt_sub: "FROM" prompt whose activation is subtracted.
        prompt_add: "TO" prompt whose activation is added.
        coeff: Multiplier applied to the activation difference.
        act_name: Layer index used for ``blocks.{act_name}.hook_resid_pre``.
        no_steering_input: When true, generate without any steering hook.

    Returns:
        The formatted configuration and samples from ``results_to_ui_output``.

    Raises:
        gr.Error: If steering is enabled and ``prompt_sub`` or ``prompt_add``
            is empty.
    """
    # FROM/TO are only used to build the steering vector, so they are required
    # only when steering is actually enabled.
    if not no_steering_input:
        if prompt_sub == "":
            raise gr.Error(
                "Please input a FROM option. Could be a single space character, a word or a phrase"
            )
        if prompt_add == "":
            raise gr.Error(
                "Please input a TO option. Could be a single space character, a word or a phrase"
            )

    print("Text generation begin:")
    time_stamp = time.time()
    print("Parameters:")
    print("prompt:", prompt)
    print("prompt_sub:", prompt_sub)
    print("prompt_add:", prompt_add)
    print("coeff:", coeff)
    print("act_name:", act_name)
    print("no_steering_input:", no_steering_input)

    if UI_DEVELOPMENT:
        # Canned output so the UI can be exercised without a model.
        res_str = [
            "To visit the Berlin wall people have to go to the wall.",
            "To visit the Berlin wall people have to go to a museum.",
        ]
    else:
        editing_hooks = []
        if not no_steering_input:
            padded_prompt_add, padded_prompt_sub = add_padding(prompt_add, prompt_sub)
            act_diff = get_activations(padded_prompt_add, padded_prompt_sub, act_name)
            ave_hook = create_hook(act_diff, coeff)
            editing_hooks = [(f"blocks.{act_name}.hook_resid_pre", ave_hook)]

        # Steered and unsteered runs share one generation path; the only
        # difference is whether a hook is installed.
        res = hooked_generate(
            [prompt] * example_count, editing_hooks, seed=SEED, **sampling_kwargs
        )

        # Remove beginning of sequence token
        res_str = model.to_string(res[:, 1:])

    ui_result = results_to_ui_output(
        res_str, prompt, prompt_sub, prompt_add, coeff, act_name, no_steering_input
    )

    print(f"Text generation end after {time.time() - time_stamp:.2f} seconds:")
    print(ui_result)

    return ui_result


# NOTE(review): not referenced anywhere else in the visible part of this file.
options_accordion = gr.Accordion(label="Steering Options", open=True)

# "FROM" prompt: its activation is subtracted when building the steering vector.
# The help text is built from adjacent string literals so that no source
# indentation ends up inside the user-visible string.
prompt_sub_input = gr.Textbox(
    lines=1,
    label="FROM",
    info=(
        "Enter a prompt that you want to steer the AI output away from. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Berlin" or "Hate".'
    ),
    value="Hate",
)

# "TO" prompt: its activation is added when building the steering vector.
prompt_add_input = gr.Textbox(
    lines=1,
    label="TO",
    info=(
        "Enter a prompt that you want to steer the AI output towards. "
        "This can be a single word or a whole phrase. E.g. "
        '"The Berlin Wall is in Hamburg" or "Love".'
    ),
    value="Love",
)

# Steering strength; passed to predict() as `coeff`.
coeff_input = gr.Slider(
    minimum=0,
    maximum=100,
    step=1,
    label="MULTIPLIER",
    info="The strength of the steering. Higher values will steer the AI output more towards the TO prompt. Be careful not to oversteer and break the AI's semantic capabilities!",
    value=12,
)

# Layer index; passed to predict() as `act_name`.
# NOTE(review): maximum=47 is presumably the last block index of gpt2-xl — confirm.
act_name_input = gr.Slider(
    minimum=0,
    maximum=47,
    step=1,
    label="LAYER",
    info="The layer of the model to steer. Higher layers are more abstract. However, steering at lower layers can lead to more coherent output. Experiment to find the best layer for your use case.",
    value=6,
)

# Toggle for plain generation; passed to predict() as `no_steering_input`.
no_steering_input = gr.Checkbox(
    label="No Steering",
    info="Check this box to generate text without steering.",
    value=False,
)

# The text to be completed; passed to predict() as `prompt`.
message_input = gr.Textbox(
    lines=1,
    label="PROMPT",
    info='Enter a message to be completed by the AI. E.g. "I hate you because".',
    placeholder="Enter a message to generate text.",
    value="I hate you because",
)

# Output box that displays the string returned by predict().
text_output = gr.Textbox(
    label="AI Text Generator",
    lines=24,
    max_lines=24,
    placeholder="Hi, I am an AI Text Generator. \n\nPlease don't steer me the wrong way! 🤖",
    show_copy_button=True,
)

# Custom stylesheet handed to gr.Interface(css=...).
# The first rule needs a property name: a bare `var(...)` is not a valid
# declaration, so `color:` is supplied to match the variable's name.
CSS = """\
  .prose {
    color: var(--block-title-text-color);
  }
  .block:has(.prose) {
    border: solid var(--panel-border-width) var(--panel-border-color);
    border-radius: var(--container-radius);
    background: var(--panel-background-fill);
    padding: var(--spacing-lg);
}
"""

# Markdown introduction handed to gr.Interface(description=...); it ends with
# the content warning for the demo.
DESCRIPTION = """\
AI Text Generation can seem magical and inscrutable, but [recent research](https://arxiv.org/abs/2308.10248) has shown that it is possible to steer the output of a model by modifying its activations. Even better, it is quite intuitive and fun!

This demo allows you to input a message and two prompts, and then steer the model's output towards one prompt and away from another. You can also control the strength of the steering and the layer of the model to steer. Try it out and see what you can create!

If you end up with something you like, feel free to share it with us [on the community tab](https://huggingface.co/spaces/janraasch/activate-love/discussions). We would love to see what you come up with!

You can use the »copy«-button on the upper right corner of the generated text box to copy your results to your clipboard. Have fun exploring the interface! 🚀

Learn more about the research behind this below. 📚

CONTENT WARNING: This interface allows you to manipulate and steer the outputs of [a large language model (GPT2-XL)](https://huggingface.co/openai-community/gpt2-xl) trained on a broad corpus of online data. The model's outputs may contain biased, offensive, explicit, or otherwise harmful content. Use this interface cautiously and at your own risk. We recommend parental guidance for minors.
"""

# Markdown article handed to gr.Interface(article=...): background, model
# details, limitations and future work.
ARTICLE = """\
# Activation Addition: Steering GPT2 Without Optimization

This Space replicates results from the paper [Activation Addition: Steering GPT2 Without Optimization](https://arxiv.org/abs/2308.10248) and provides a user-friendly interface for anybody to gain intuition about how activation steering works.

🔎 For more details about the research behind this take a look at [this post on the AI Alignment Forum](https://www.alignmentforum.org/posts/5spBue2z2tw4JuDCx/steering-gpt-2-xl-by-adding-an-activation-vector) or check out [the original paper](https://arxiv.org/abs/2308.10248).

## Model Details

We use a [pre-trained GPT2-XL model](https://huggingface.co/openai-community/gpt2-xl) from the Hugging Face model hub. The model is loaded with the [`transformer_lens` library](https://transformerlensorg.github.io/TransformerLens/), which allows us to access the activations of the model at different layers.

## Limitations

*So how is this not the solution to the [Alignment Problem](https://en.wikipedia.org/wiki/AI_alignment)?* you might ask.

Well, this is early research, and there are some limitations to keep in mind 😇:

* [GPT2-XL](https://huggingface.co/openai-community/gpt2-xl) is quite small compared to models currently being trained (like e.g. [LLAMA3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)).
* Activation Steering is not perfect and can lead to unintended side effects. For example, steering the model toward a prompt might lead to the model generating text that is not semantically coherent.
* Activation Steering is also not guaranteed to work for all prompts and all layers.
* It is still an open question how to best steer models in a safe and reliable way.

## Future Work

There is an even more recent paper that builds on this research: [Steering Llama 2 via Contrastive Activation Addition](https://arxiv.org/abs/2312.06681). This paper steers the [LLAMA-2 model](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) with contrastive activation additions and shows that it is possible to steer a larger model chatbot with this technique.

Hence, we would like to try to replicate these results on a Hugging Face Space thus providing a chat interface that can be steered to be more helpful or more harmful.
"""

# Example rows for gr.Interface(examples=...). Each row lists its values in
# the same order as the interface inputs:
# PROMPT, FROM, TO, MULTIPLIER, LAYER, No Steering.
EXAMPLES = [
    [
        "I hate you because",
        "Hate",
        "Love",
        12,
        6,
        False,
    ],
    [
        "To see the Berlin Wall, people flock to",
        "The Berlin Wall is in Berlin",
        "The Berlin Wall is in Hamburg",
        10,
        20,
        False,
    ],
    [
        "I went up to my friend and said",
        " ",
        " wedding",
        4,
        6,
        False,
    ],
]

# Wire predict() to the widgets defined above and start the app.
demo = gr.Interface(
    # NOTE(review): the theme id contains the literal text "[email protected]",
    # which looks like an e-mail-obfuscation artifact rather than a real
    # theme/version pin — confirm the intended value against the upstream file.
    theme="gradio/[email protected]",
    fn=predict,
    # Order must match predict()'s positional parameters and the EXAMPLES rows.
    inputs=[
        message_input,
        prompt_sub_input,
        prompt_add_input,
        coeff_input,
        act_name_input,
        no_steering_input,
    ],
    outputs=text_output,
    title="ACTIVATE LOVE",
    description=DESCRIPTION,
    allow_duplication=True,
    article=ARTICLE,
    allow_flagging="never",
    examples=EXAMPLES,
    cache_examples=False,
    css=CSS,
)
print("Starting demo!")
# Runs at import time: there is no `if __name__ == "__main__"` guard.
demo.launch()