'''
This file is part of Open-MoE-LLM-Leaderboard and is modified from work
released under the Apache 2.0 License by the arena-hard project
(https://github.com/lm-sys/arena-hard).
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
See the NOTICE file distributed with this work for additional
information regarding copyright ownership.
'''
import os
import json
import time
import yaml
import random

from glob import glob

# API setting constants
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"

OPENAI_MODEL_LIST = (
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-0613-verbose",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4",
    "gpt-4-0314",
    "gpt-4-0613",
    "gpt-4-turbo",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
)

temperature_config = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}
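
# Hypothetical usage sketch: pick a sampling temperature by question category,
# assuming each question dict carries a "category" field; the 0.7 fallback is
# an illustrative default, not something this table mandates.
#
#   temperature = temperature_config.get(question["category"], 0.7)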


def load_questions(question_file: str):
    """Load questions from a JSONL file (one JSON object per line)."""
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            line = line.strip()
            if line:  # skip blank lines, which json.loads would reject
                questions.append(json.loads(line))
    return questions
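
# Example usage (illustrative path, not one this repo guarantees):
#
#   questions = load_questions("data/mt_bench/question.jsonl")
#   print(f"loaded {len(questions)} questions")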


def load_model_answers(answer_dir: str):
    """Load model answers.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
    """
    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
    filenames.sort()
    model_answers = {}

    for filename in filenames:
        model_name = os.path.basename(filename)[:-6]  # strip the ".jsonl" suffix
        answer = {}
        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                answer[line["question_id"]] = line
        model_answers[model_name] = answer

    return model_answers
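
# Example usage (hypothetical directory; the inner layout of each answer dict
# depends on how the answer files were generated):
#
#   model_answers = load_model_answers("data/model_answer")
#   answers_for_model = model_answers["gpt-4-0613"]  # question_id -> answer dict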


def get_endpoint(endpoint_list):
    if endpoint_list is None:
        return None
    # randomly pick one endpoint
    api_dict = random.choice(endpoint_list)
    return api_dict
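
# Hypothetical endpoint configuration, matching the "api_base"/"api_key" keys
# that chat_completion_openai reads below; the URLs and key values are
# placeholders:
#
#   endpoints = [
#       {"api_base": "https://api.openai.com/v1", "api_key": "sk-..."},
#       {"api_base": "http://localhost:8000/v1", "api_key": "EMPTY"},
#   ]
#   api_dict = get_endpoint(endpoints)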


# load config args from config yaml files
def make_config(config_file: str) -> dict:
    with open(config_file, "r") as f:
        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
    return config_kwargs
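
# Sketch of a config file make_config could load; the file name and key names
# below are hypothetical (any valid YAML mapping works):
#
#   judge_config.yaml:
#       judge_model: gpt-4-1106-preview
#       temperature: 0.0
#       max_tokens: 4096
#
#   config = make_config("judge_config.yaml")
#   config["judge_model"]  # -> "gpt-4-1106-preview"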


def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    import openai
    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
        except KeyError as e:
            print(type(e), e)
            break

    return output
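
# Example usage (assumes the `openai` package is installed and OPENAI_API_KEY
# is set, or that an api_dict with "api_base"/"api_key" is passed in):
#
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "Say hello."},
#   ]
#   reply = chat_completion_openai("gpt-3.5-turbo", messages,
#                                  temperature=0.7, max_tokens=256)
#   if reply == API_ERROR_OUTPUT:
#       print("all retries failed")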

# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
#     import openai
#     from openai import AzureOpenAI

#     api_base = api_dict["api_base"]
#     client = AzureOpenAI(
#         azure_endpoint=api_base,
#         api_key=api_dict["api_key"],
#         api_version=api_dict["api_version"],
#         timeout=240,
#         max_retries=2,
#     )

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 seed=42,
#             )
#             output = response.choices[0].message.content
#             break
#         except openai.RateLimitError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#         except openai.BadRequestError as e:
#             print(type(e), e)
#             break
#         except KeyError as e:
#             print(type(e), e)
#             break

#     return output

# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
#     import anthropic

#     if api_dict:
#         api_key = api_dict["api_key"]
#     else:
#         api_key = os.environ["ANTHROPIC_API_KEY"]

#     sys_msg = ""
#     if messages[0]["role"] == "system":
#         sys_msg = messages[0]["content"]
#         messages = messages[1:]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             c = anthropic.Anthropic(api_key=api_key)
#             response = c.messages.create(
#                 model=model,
#                 messages=messages,
#                 stop_sequences=[anthropic.HUMAN_PROMPT],
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 system=sys_msg,
#             )
#             output = response.content[0].text
#             break
#         except anthropic.APIError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#     return output

# def chat_completion_mistral(model, messages, temperature, max_tokens):
#     from mistralai.client import MistralClient
#     from mistralai.models.chat_completion import ChatMessage
#     from mistralai.exceptions import MistralException

#     api_key = os.environ["MISTRAL_API_KEY"]
#     client = MistralClient(api_key=api_key)

#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             chat_response = client.chat(
#                 model=model,
#                 messages=prompts,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#             )
#             output = chat_response.choices[0].message.content
#             break
#         except MistralException as e:
#             print(type(e), e)
#             break

#     return output

# def chat_completion_gemini(model, messages, temperature, max_tokens):
#     import google.generativeai as genai
#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])

#     safety_settings = [
#         {
#             "category": "HARM_CATEGORY_HARASSMENT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_HATE_SPEECH",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
#             "threshold": "BLOCK_NONE"
#         },
#     ]

#     # Set up the model
#     generation_config = {
#         "temperature": temperature,
#         "top_p": 1,
#         "top_k": 1,
#         "max_output_tokens": max_tokens,
#     }

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             gemini = genai.GenerativeModel(
#                 model_name=model,
#                 generation_config=generation_config,
#                 safety_settings=safety_settings)

#             convo = gemini.start_chat(history=[])
#             convo.send_message(messages)
#             output = convo.last.text
#             break
#         except genai.types.generation_types.StopCandidateException as e:
#             print(type(e), e)
#             break
#         except Exception as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)

#     return output

# def chat_completion_cohere(model, messages, temperature, max_tokens):
#     import cohere

#     co = cohere.Client(os.environ["COHERE_API_KEY"])
#     assert len(messages) > 0

#     template_map = {"system": "SYSTEM",
#                     "assistant": "CHATBOT",
#                     "user": "USER"}

#     assert messages[-1]["role"] == "user"
#     prompt = messages[-1]["content"]

#     if len(messages) > 1:
#         history = []
#         for message in messages[:-1]:
#             history.append({"role": template_map[message["role"]], "message": message["content"]})
#     else:
#         history = None

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = co.chat(
#                 message=prompt,
#                 model=model,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 chat_history=history,
#             )
#             output = response.text
#             break
#         except cohere.core.api_error.ApiError as e:
#             print(type(e), e)
#             raise
#         except Exception as e:
#             print(type(e), e)
#             break

#     return output


def reorg_answer_file(answer_file):
    """Sort answers by question id and de-duplicate, keeping the last answer per id."""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])
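
# Example usage (illustrative path): rewrite an answer file in place so entries
# are sorted by question_id, keeping only the last answer for any repeated id.
#
#   reorg_answer_file("data/model_answer/gpt-4-0613.jsonl")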