'''
This file is part of Open-MoE-LLM-Leaderboard and is modified from work
released under the Apache 2.0 License by the arena-hard project
(https://github.com/lm-sys/arena-hard).
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
See the NOTICE file distributed with this work for additional
information regarding copyright ownership.
'''
import os
import json
import time
import yaml
import random

from glob import glob

# API setting constants
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"

OPENAI_MODEL_LIST = (
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-0613-verbose",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4",
    "gpt-4-0314",
    "gpt-4-0613",
    "gpt-4-turbo",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
)

temperature_config = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}
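
# Hypothetical usage sketch: pick a sampling temperature by question category,
# assuming each question dict carries a "category" field; the 0.7 fallback is
# an illustrative default, not something this table mandates.
#
#   temperature = temperature_config.get(question["category"], 0.7)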


def load_questions(question_file: str):
    """Load questions from a JSONL file (one JSON object per line)."""
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            line = line.strip()
            if line:  # skip blank lines, which json.loads would reject
                questions.append(json.loads(line))
    return questions
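
# Example usage (illustrative path, not one this repo guarantees):
#
#   questions = load_questions("data/mt_bench/question.jsonl")
#   print(f"loaded {len(questions)} questions")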


def load_model_answers(answer_dir: str):
    """Load model answers.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
    """
    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
    filenames.sort()
    model_answers = {}

    for filename in filenames:
        model_name = os.path.basename(filename)[:-6]  # strip the ".jsonl" suffix
        answer = {}
        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                answer[line["question_id"]] = line
        model_answers[model_name] = answer

    return model_answers
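
# Example usage (hypothetical directory; the inner layout of each answer dict
# depends on how the answer files were generated):
#
#   model_answers = load_model_answers("data/model_answer")
#   answers_for_model = model_answers["gpt-4-0613"]  # question_id -> answer dict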


def get_endpoint(endpoint_list):
    if endpoint_list is None:
        return None
    # randomly pick one endpoint
    api_dict = random.choice(endpoint_list)
    return api_dict
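
# Hypothetical endpoint configuration, matching the "api_base"/"api_key" keys
# that chat_completion_openai reads below; the URLs and key values are
# placeholders:
#
#   endpoints = [
#       {"api_base": "https://api.openai.com/v1", "api_key": "sk-..."},
#       {"api_base": "http://localhost:8000/v1", "api_key": "EMPTY"},
#   ]
#   api_dict = get_endpoint(endpoints)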


# load config args from config yaml files
def make_config(config_file: str) -> dict:
    with open(config_file, "r") as f:
        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
    return config_kwargs
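
# Sketch of a config file make_config could load; the file name and key names
# below are hypothetical (any valid YAML mapping works):
#
#   judge_config.yaml:
#       judge_model: gpt-4-1106-preview
#       temperature: 0.0
#       max_tokens: 4096
#
#   config = make_config("judge_config.yaml")
#   config["judge_model"]  # -> "gpt-4-1106-preview"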


def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    import openai
    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
        except KeyError as e:
            print(type(e), e)
            break

    return output
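
# Example usage (assumes the `openai` package is installed and OPENAI_API_KEY
# is set, or that an api_dict with "api_base"/"api_key" is passed in):
#
#   messages = [
#       {"role": "system", "content": "You are a helpful assistant."},
#       {"role": "user", "content": "Say hello."},
#   ]
#   reply = chat_completion_openai("gpt-3.5-turbo", messages,
#                                  temperature=0.7, max_tokens=256)
#   if reply == API_ERROR_OUTPUT:
#       print("all retries failed")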

# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
#     import openai
#     from openai import AzureOpenAI

#     api_base = api_dict["api_base"]
#     client = AzureOpenAI(
#         azure_endpoint=api_base,
#         api_key=api_dict["api_key"],
#         api_version=api_dict["api_version"],
#         timeout=240,
#         max_retries=2,
#     )

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 seed=42,
#             )
#             output = response.choices[0].message.content
#             break
#         except openai.RateLimitError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#         except openai.BadRequestError as e:
#             print(type(e), e)
#             break
#         except KeyError as e:
#             print(type(e), e)
#             break

#     return output

# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
#     import anthropic

#     if api_dict:
#         api_key = api_dict["api_key"]
#     else:
#         api_key = os.environ["ANTHROPIC_API_KEY"]

#     sys_msg = ""
#     if messages[0]["role"] == "system":
#         sys_msg = messages[0]["content"]
#         messages = messages[1:]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             c = anthropic.Anthropic(api_key=api_key)
#             response = c.messages.create(
#                 model=model,
#                 messages=messages,
#                 stop_sequences=[anthropic.HUMAN_PROMPT],
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 system=sys_msg,
#             )
#             output = response.content[0].text
#             break
#         except anthropic.APIError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#     return output

# def chat_completion_mistral(model, messages, temperature, max_tokens):
#     from mistralai.client import MistralClient
#     from mistralai.models.chat_completion import ChatMessage
#     from mistralai.exceptions import MistralException

#     api_key = os.environ["MISTRAL_API_KEY"]
#     client = MistralClient(api_key=api_key)

#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             chat_response = client.chat(
#                 model=model,
#                 messages=prompts,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#             )
#             output = chat_response.choices[0].message.content
#             break
#         except MistralException as e:
#             print(type(e), e)
#             break

#     return output

# def chat_completion_gemini(model, messages, temperature, max_tokens):
#     import google.generativeai as genai
#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])

#     safety_settings = [
#         {
#             "category": "HARM_CATEGORY_HARASSMENT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_HATE_SPEECH",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
#             "threshold": "BLOCK_NONE"
#         },
#     ]

#     # Set up the model
#     generation_config = {
#         "temperature": temperature,
#         "top_p": 1,
#         "top_k": 1,
#         "max_output_tokens": max_tokens,
#     }

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             gemini = genai.GenerativeModel(
#                 model_name=model,
#                 generation_config=generation_config,
#                 safety_settings=safety_settings)

#             convo = gemini.start_chat(history=[])
#             convo.send_message(messages)
#             output = convo.last.text
#             break
#         except genai.types.generation_types.StopCandidateException as e:
#             print(type(e), e)
#             break
#         except Exception as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)

#     return output

# def chat_completion_cohere(model, messages, temperature, max_tokens):
#     import cohere

#     co = cohere.Client(os.environ["COHERE_API_KEY"])
#     assert len(messages) > 0

#     template_map = {"system": "SYSTEM",
#                     "assistant": "CHATBOT",
#                     "user": "USER"}

#     assert messages[-1]["role"] == "user"
#     prompt = messages[-1]["content"]

#     if len(messages) > 1:
#         history = []
#         for message in messages[:-1]:
#             history.append({"role": template_map[message["role"]], "message": message["content"]})
#     else:
#         history = None

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = co.chat(
#                 message=prompt,
#                 model=model,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 chat_history=history,
#             )
#             output = response.text
#             break
#         except cohere.core.api_error.ApiError as e:
#             print(type(e), e)
#             raise
#         except Exception as e:
#             print(type(e), e)
#             break

#     return output


def reorg_answer_file(answer_file):
    """Sort answers by question id and de-duplicate, keeping the last answer per id."""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])
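
# Example usage (illustrative path): rewrite an answer file in place so entries
# are sorted by question_id, keeping only the last answer for any repeated id.
#
#   reorg_answer_file("data/model_answer/gpt-4-0613.jsonl")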