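"""Gradio demo for BiasProbe on Hugging Face Spaces.

Given a comma-separated list of words, the app reports how often
Mistral-7B-Instruct associates each word with positive rather than negative
emotions, as scored by a trained binary probe over layer-15 hidden states.
"""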
import argparse
from threading import Lock

import gradio as gr
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt

from biasprobe import BinaryProbe, PairwiseExtractionRunner, ProbeConfig, SimplePairPromptBuilder

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', '-s', type=int, default=0, help="the random seed")
    parser.add_argument('--port', '-p', type=int, default=8080, help="the port to launch the demo")
    parser.add_argument('--no-cuda', action='store_true', help="use CPUs instead of GPUs")

    return parser.parse_args()

def main():
    args = get_args()
    plt.switch_backend('agg')

    # Shard the model automatically across available devices, capping GPU 0 at 24 GiB.
    dmap = 'auto'
    mdict = {0: '24GIB'}

    # Load the probe checkpoint and the Mistral-7B-Instruct model used for extraction.
    config = ProbeConfig.create_for_model('mistralai/Mistral-7B-Instruct-v0.1')
    probe = BinaryProbe(config).cuda()
    probe.load_state_dict(torch.load('probe.pt'))
    runner = PairwiseExtractionRunner.from_pretrained(
        'mistralai/Mistral-7B-Instruct-v0.1', optimize=False, torch_dtype=torch.float16,
        max_memory=mdict, device_map=dmap, low_cpu_mem_usage=True)
    def run_extraction(prompt):
        builder = SimplePairPromptBuilder(criterion='more positive')

        # Take up to 100 comma-separated words from the first 300 characters of the prompt.
        lst = [x.strip() for x in prompt.lower()[:300].split(',')][:100]
        exp = runner.run_extraction(lst, lst, layers=[15], num_repeat=50, builder=builder,
                                    parallel=False, run_inference=True, debug=True, max_new_tokens=2)
        test_ds = exp.make_dataset(15)

        raw_scores = []
        preds_list = []
        hs = []
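
        # The probe emits one scalar per word pair; the sign of that scalar decides the
        # predicted ordering of the two items. A word's "Win Rate" in the returned table
        # is the fraction of its recorded comparisons whose score was positive.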
        for idx, (tensor, labels) in enumerate(test_ds):
            with torch.no_grad():
                labels = labels - 1  # labels are 1-indexed; shift to 0-based

                # Each example should contain exactly one pair of representations.
                if tensor.shape[0] != 2:
                    continue

                h = tensor[1] - tensor[0]
                hs.append(h)

                try:
                    x = probe(tensor.unsqueeze(0).cuda().float()).squeeze()
                except IndexError:
                    continue

                pred = np.array([0, 1] if x.item() > 0 else [1, 0])

                if test_ds.original_examples is not None:
                    items = [hit.content for hit in test_ds.original_examples[idx].hits]
                    preds_list.append(np.array(items, dtype=object)[labels][pred].tolist())
                    raw_scores.append(x.item())

        df = pd.DataFrame({'Win Rate': np.array(raw_scores) > 0, 'Word': [x[0] for x in preds_list]})
        win_df = df.groupby('Word')['Win Rate'].mean().reset_index().sort_values('Win Rate')
        win_df['Win Rate'] = [f'{x}%' for x in (win_df['Win Rate'] * 100).round(2).tolist()]

        return win_df

    with gr.Blocks(css='scrollbar.css') as demo:
        md = '''# BiasProbe: Revealing Preference Biases in Language Model Representations

What do llamas really "think" about controversial words? Type some words below to see how
Mistral-7B-Instruct associates them with positive and negative emotions. Higher win rates
indicate that the word is more likely to be associated with positive emotions than other
words in the list.

Check out our paper, [What Do Llamas Really Think? Revealing Preference Biases in Language Model Representations](http://arxiv.org/abs/2311.18812). See our [codebase](https://github.com/castorini/biasprobe) on GitHub.
'''
        gr.Markdown(md)

        with gr.Row():
            with gr.Column():
                text = gr.Textbox(label='Words', value='Republican, democrat, libertarian, authoritarian')
                submit_btn = gr.Button('Submit', elem_id='submit-btn')

        output = gr.DataFrame(pd.DataFrame({'Word': ['authoritarian', 'republican', 'democrat', 'libertarian'],
                                            'Win Rate': ['44.44%', '81.82%', '100%', '100%']}))

        submit_btn.click(fn=run_extraction, inputs=[text], outputs=[output])

    # Relaunch if the server dies with an OSError (e.g., the port is busy); exit on Ctrl-C.
    while True:
        try:
            demo.launch(server_name='0.0.0.0')
        except OSError:
            gr.close_all()
        except KeyboardInterrupt:
            gr.close_all()
            break


if __name__ == '__main__':
    main()
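
# To run locally: `python app.py` (assuming this file is the Space's app.py and that
# `probe.pt` and `scrollbar.css` sit next to it). Note that the --seed, --port, and
# --no-cuda flags are parsed but not currently wired into the launch call.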