'''
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
under the Apache 2.0 License from the arena-hard project
(https://github.com/lm-sys/arena-hard).

Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap,
Banghua Zhu, Joseph E. Gonzalez, Ion Stoica

See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
'''
import os
import json
import time
import yaml
import random

from typing import Optional
from glob import glob

# API setting constants
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"

OPENAI_MODEL_LIST = (
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-0613-verbose",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4",
    "gpt-4-0314",
    "gpt-4-0613",
    "gpt-4-turbo",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
)

# Sampling temperature used for each question category.
temperature_config = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}


def load_questions(question_file: str):
    """Load questions from a JSONL file (one JSON object per line)."""
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            if line.strip():  # skip blank lines
                questions.append(json.loads(line))
    return questions


def load_model_answers(answer_dir: str):
    """Load model answers.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
    """
    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
    filenames.sort()
    model_answers = {}

    for filename in filenames:
        model_name = os.path.basename(filename)[:-6]  # strip the ".jsonl" suffix
        answer = {}
        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                answer[line["question_id"]] = line
        model_answers[model_name] = answer

    return model_answers


def get_endpoint(endpoint_list):
    """Randomly pick one endpoint dict from a list; return None if no list is given."""
    if endpoint_list is None:
        return None
    # randomly pick one
    api_dict = random.choice(endpoint_list)
    return api_dict


# load config args from config yaml files
def make_config(config_file: str) -> dict:
    config_kwargs = {}
    with open(config_file, "r") as f:
        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)

    return config_kwargs
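# --------------------------------------------------------------------------
# Illustrative note (an assumption for readers, not taken from the original
# repo): get_endpoint() above expects a list of endpoint dicts, and
# chat_completion_openai() below reads "api_base" and "api_key" from the one
# it receives. A config YAML loaded with make_config() could therefore look
# roughly like:
#
#   judge_model: gpt-4-1106-preview
#   endpoints:
#     - api_base: https://api.openai.com/v1
#       api_key: sk-...
#     - api_base: https://my-proxy.example.com/v1
#       api_key: sk-...
#
# Only the "api_base" / "api_key" field names are implied by the code; the
# surrounding keys and layout are hypothetical.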
def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    import openai

    if api_dict:
        # Use a custom endpoint (e.g. a proxy) described by the config.
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    # Retry on rate limits up to API_MAX_RETRY times; give up on other errors.
    for _ in range(API_MAX_RETRY):
        try:
            # print(messages)
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
        except KeyError as e:
            print(type(e), e)
            break

    return output


# Alternative judge backends for other providers (currently disabled).

# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
#     import openai
#     from openai import AzureOpenAI
#
#     api_base = api_dict["api_base"]
#     client = AzureOpenAI(
#         azure_endpoint=api_base,
#         api_key=api_dict["api_key"],
#         api_version=api_dict["api_version"],
#         timeout=240,
#         max_retries=2,
#     )
#
#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 seed=42,
#             )
#             output = response.choices[0].message.content
#             break
#         except openai.RateLimitError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#         except openai.BadRequestError as e:
#             print(type(e), e)
#             break
#         except KeyError:
#             print(type(e), e)
#             break
#
#     return output


# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
#     import anthropic
#
#     if api_dict:
#         api_key = api_dict["api_key"]
#     else:
#         api_key = os.environ["ANTHROPIC_API_KEY"]
#
#     sys_msg = ""
#     if messages[0]["role"] == "system":
#         sys_msg = messages[0]["content"]
#         messages = messages[1:]
#
#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             # print(sys_msg)
#             c = anthropic.Anthropic(api_key=api_key)
#             response = c.messages.create(
#                 model=model,
#                 messages=messages,
#                 stop_sequences=[anthropic.HUMAN_PROMPT],
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 system=sys_msg,
#             )
#             output = response.content[0].text
#             break
#         except anthropic.APIError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#     return output


# def chat_completion_mistral(model, messages, temperature, max_tokens):
#     from mistralai.client import MistralClient
#     from mistralai.models.chat_completion import ChatMessage
#     from mistralai.exceptions import MistralException
#
#     api_key = os.environ["MISTRAL_API_KEY"]
#     client = MistralClient(api_key=api_key)
#
#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
#
#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             chat_response = client.chat(
#                 model=model,
#                 messages=prompts,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#             )
#             output = chat_response.choices[0].message.content
#             break
#         except MistralException as e:
#             print(type(e), e)
#             break
#
#     return output


# def chat_completion_gemini(model, messages, temperature, max_tokens):
#     import google.generativeai as genai
#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])
#
#     safety_settings = [
#         {
#             "category": "HARM_CATEGORY_HARASSMENT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_HATE_SPEECH",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
#             "threshold": "BLOCK_NONE"
#         },
#     ]
#
#     # Set up the model
#     generation_config = {
#         "temperature": temperature,
#         "top_p": 1,
#         "top_k": 1,
#         "max_output_tokens": max_tokens,
#     }
#
#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             gemini = genai.GenerativeModel(
#                 model_name=model,
#                 generation_config=generation_config,
#                 safety_settings=safety_settings)
#
#             convo = gemini.start_chat(history=[])
#
#             convo.send_message(messages)
#             output = convo.last.text
#             break
#         except genai.types.generation_types.StopCandidateException as e:
#             print(type(e), e)
#             break
#         except Exception as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#
#     return output


# def chat_completion_cohere(model, messages, temperature, max_tokens):
#     import cohere
#
#     co = cohere.Client(os.environ["COHERE_API_KEY"])
#     assert len(messages) > 0
#
#     template_map = {"system": "SYSTEM",
#                     "assistant": "CHATBOT",
#                     "user": "USER"}
#
#     assert messages[-1]["role"] == "user"
#     prompt = messages[-1]["content"]
#
#     if len(messages) > 1:
#         history = []
#         for message in messages[:-1]:
#             history.append({"role": template_map[message["role"]], "message": message["content"]})
#     else:
#         history = None
#
#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = co.chat(
#                 message=prompt,
#                 model=model,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 chat_history=history,
#             )
#             output = response.text
#             break
#         except cohere.core.api_error.ApiError as e:
#             print(type(e), e)
#             raise
#         except Exception as e:
#             print(type(e), e)
#             break
#
#     return output
def reorg_answer_file(answer_file):
    """Sort answers by question id and de-duplicate them."""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])
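# --------------------------------------------------------------------------
# Minimal usage sketch (illustration only; the file paths, config keys, and
# question schema below are assumptions, not part of the original project).
# It shows how the helpers in this module compose: load a YAML config and a
# question file, pick an endpoint, and request one judgment from the OpenAI
# API. Requires OPENAI_API_KEY (or an endpoint list with api_key) to run.
if __name__ == "__main__":
    # Hypothetical locations, used only for this example.
    example_config_file = "config/judge_config.yaml"
    example_question_file = "data/question.jsonl"

    config = make_config(example_config_file)
    questions = load_questions(example_question_file)

    # Randomly pick one endpoint if the config lists several; with None,
    # chat_completion_openai falls back to the default OpenAI client.
    endpoint = get_endpoint(config.get("endpoints"))

    for question in questions[:1]:  # just the first question, as a smoke test
        # Assumed arena-hard-style schema: the user prompt lives in "turns".
        messages = [{"role": "user", "content": question["turns"][0]["content"]}]
        # Map the question category to a sampling temperature, defaulting to 0.0.
        temperature = temperature_config.get(question.get("category"), 0.0)
        answer = chat_completion_openai(
            model=config.get("judge_model", "gpt-4-1106-preview"),
            messages=messages,
            temperature=temperature,
            max_tokens=config.get("max_tokens", 1024),
            api_dict=endpoint,
        )
        print(answer)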