"""Generate answers with GPT-4
Usage:
python3 get_api_answer.py --model gpt-3.5-turbo
"""
import argparse
import json
import os
import time
import concurrent.futures

import shortuuid
import tqdm

from fastchat.llm_judge.common import (
    load_questions,
    temperature_config,
    chat_compeletion_openai,
    chat_compeletion_anthropic,
    chat_compeletion_palm,
)
from fastchat.llm_judge.gen_model_answer import reorg_answer_file
from fastchat.model.model_adapter import get_conversation_template


def get_answer(
    question: dict, model: str, num_choices: int, max_tokens: int, answer_file: str
):
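    """Generate answers for a single benchmark question and append them.

    Calls the API model once per conversation turn, then writes one JSON
    line (question id, answer id, choices, timestamp) to ``answer_file``.
    Uses the module-level ``args`` parsed in __main__ for the optional
    --force-temperature override.
    """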
    # "is not None" so that an explicit --force-temperature 0.0 is honored
    if args.force_temperature is not None:
        temperature = args.force_temperature
    elif question["category"] in temperature_config:
        temperature = temperature_config[question["category"]]
    else:
        temperature = 0.7

    choices = []
    chat_state = None  # for palm-2 model
    for i in range(num_choices):
        conv = get_conversation_template(model)

        turns = []
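        # Query the API once per user turn; the reply is written back into the
        # conversation so later turns see the model's earlier answers.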
        for j in range(len(question["turns"])):
            conv.append_message(conv.roles[0], question["turns"][j])
            conv.append_message(conv.roles[1], None)

            if model in ["gpt-3.5-turbo", "gpt-4"]:
                output = chat_compeletion_openai(model, conv, temperature, max_tokens)
            elif model in ["claude-v1", "claude-instant-v1"]:
                output = chat_compeletion_anthropic(
                    model, conv, temperature, max_tokens
                )
            elif model == "palm-2-chat-bison-001":
                chat_state, output = chat_compeletion_palm(
                    chat_state, model, conv, temperature, max_tokens
                )
            else:
                raise ValueError(f"Invalid model name: {model}")

            conv.update_last_message(output)
            turns.append(output)

        choices.append({"index": i, "turns": turns})

    # Dump answers
    ans = {
        "question_id": question["question_id"],
        "answer_id": shortuuid.uuid(),
        "model_id": model,
        "choices": choices,
        "tstamp": time.time(),
    }
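
    # Append one JSON line per question; worker threads all share this file.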
    os.makedirs(os.path.dirname(answer_file), exist_ok=True)
    with open(answer_file, "a") as fout:
        fout.write(json.dumps(ans) + "\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bench-name",
        type=str,
        default="mt_bench",
        help="The name of the benchmark question set.",
    )
    parser.add_argument("--answer-file", type=str, help="The output answer file.")
    parser.add_argument("--model", type=str, default="gpt-3.5-turbo")
    parser.add_argument(
        "--num-choices",
        type=int,
        default=1,
        help="How many completion choices to generate.",
    )
    parser.add_argument(
        "--force-temperature", type=float, help="Forcibly set a sampling temperature."
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=1024,
        help="The maximum number of new generated tokens.",
    )
    parser.add_argument(
        "--question-begin",
        type=int,
        help="A debug option. The begin index of questions.",
    )
    parser.add_argument(
        "--question-end", type=int, help="A debug option. The end index of questions."
    )
    parser.add_argument(
        "--parallel", type=int, default=1, help="The number of concurrent API calls."
    )
    args = parser.parse_args()

    question_file = f"data/{args.bench_name}/question.jsonl"
    questions = load_questions(question_file, args.question_begin, args.question_end)

    if args.answer_file:
        answer_file = args.answer_file
    else:
        answer_file = f"data/{args.bench_name}/model_answer/{args.model}.jsonl"
    print(f"Output to {answer_file}")
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as executor:
        futures = []
        for question in questions:
            future = executor.submit(
                get_answer,
                question,
                args.model,
                args.num_choices,
                args.max_tokens,
                answer_file,
            )
            futures.append(future)

        for future in tqdm.tqdm(
            concurrent.futures.as_completed(futures), total=len(futures)
        ):
            future.result()
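
    # Rewrite the appended answer file in question-id order once all workers finish.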
    reorg_answer_file(answer_file)