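"""Gradio demo for BiasProbe on Hugging Face Spaces.

Given a comma-separated list of words, the app reports how often
Mistral-7B-Instruct associates each word with positive rather than negative
emotions, as scored by a trained binary probe over layer-15 hidden states.
"""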
import argparse
from threading import Lock

import gradio as gr
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt

from biasprobe import BinaryProbe, PairwiseExtractionRunner, ProbeConfig, SimplePairPromptBuilder

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=int, default=0, help="the random seed")
    parser.add_argument('--port', '-p', type=int, default=8080, help="the port to launch the demo")
    parser.add_argument('--no-cuda', action='store_true', help="use CPUs instead of GPUs")

    return parser.parse_args()

def main():
    args = get_args()
    plt.switch_backend('agg')

    # Shard the model automatically across available devices, capping GPU 0 at 24 GiB.
    dmap = 'auto'
    mdict = {0: '24GIB'}

    # Load the probe checkpoint and the Mistral-7B-Instruct model used for extraction.
    config = ProbeConfig.create_for_model('mistralai/Mistral-7B-Instruct-v0.1')
    probe = BinaryProbe(config).cuda()
    probe.load_state_dict(torch.load('probe.pt'))
    runner = PairwiseExtractionRunner.from_pretrained(
        'mistralai/Mistral-7B-Instruct-v0.1', optimize=False, torch_dtype=torch.float16,
        max_memory=mdict, device_map=dmap, low_cpu_mem_usage=True)
    def run_extraction(prompt):
        builder = SimplePairPromptBuilder(criterion='more positive')

        # Take up to 100 comma-separated words from the first 300 characters of the prompt.
        lst = [x.strip() for x in prompt.lower()[:300].split(',')][:100]
        exp = runner.run_extraction(lst, lst, layers=[15], num_repeat=50, builder=builder,
                                    parallel=False, run_inference=True, debug=True, max_new_tokens=2)
        test_ds = exp.make_dataset(15)

        raw_scores = []
        preds_list = []
        hs = []
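
        # The probe emits one scalar per word pair; the sign of that scalar decides the
        # predicted ordering of the two items. A word's "Win Rate" in the returned table
        # is the fraction of its recorded comparisons whose score was positive.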
        for idx, (tensor, labels) in enumerate(test_ds):
            with torch.no_grad():
                labels = labels - 1  # labels are 1-indexed; shift to 0-based

                # Each example should contain exactly one pair of representations.
                if tensor.shape[0] != 2:
                    continue

                h = tensor[1] - tensor[0]
                hs.append(h)

                try:
                    x = probe(tensor.unsqueeze(0).cuda().float()).squeeze()
                except IndexError:
                    continue

                pred = np.array([0, 1] if x.item() > 0 else [1, 0])

                if test_ds.original_examples is not None:
                    items = [hit.content for hit in test_ds.original_examples[idx].hits]
                    preds_list.append(np.array(items, dtype=object)[labels][pred].tolist())
                    raw_scores.append(x.item())

        df = pd.DataFrame({'Win Rate': np.array(raw_scores) > 0, 'Word': [x[0] for x in preds_list]})
        win_df = df.groupby('Word')['Win Rate'].mean().reset_index().sort_values('Win Rate')
        win_df['Win Rate'] = [f'{x}%' for x in (win_df['Win Rate'] * 100).round(2).tolist()]

        return win_df

    with gr.Blocks(css='scrollbar.css') as demo:
        md = '''# BiasProbe: Revealing Preference Biases in Language Model Representations

What do llamas really "think" about controversial words? Type some words below to see how
Mistral-7B-Instruct associates them with positive and negative emotions. Higher win rates
indicate that the word is more likely to be associated with positive emotions than other
words in the list.

Check out our paper, [What Do Llamas Really Think? Revealing Preference Biases in Language Model Representations](http://arxiv.org/abs/2311.18812). See our [codebase](https://github.com/castorini/biasprobe) on GitHub.
'''
        gr.Markdown(md)

        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label='Words', value='Republican, democrat, libertarian, authoritarian')
                submit_btn = gr.Button('Submit', elem_id='submit-btn')

        output = gr.DataFrame(pd.DataFrame({'Word': ['authoritarian', 'republican', 'democrat', 'libertarian'],
                                            'Win Rate': ['44.44%', '81.82%', '100%', '100%']}))

        submit_btn.click(fn=run_extraction, inputs=[text], outputs=[output])

    # Relaunch if the server dies with an OSError (e.g., the port is busy); exit on Ctrl-C.
    while True:
        try:
            demo.launch(server_name='0.0.0.0')
        except OSError:
            gr.close_all()
        except KeyboardInterrupt:
            gr.close_all()
            break


if __name__ == '__main__':
    main()
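
# To run locally: `python app.py` (assuming this file is the Space's app.py and that
# `probe.pt` and `scrollbar.css` sit next to it). Note that the --seed, --port, and
# --no-cuda flags are parsed but not currently wired into the launch call.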