import logging
import os
from typing import Dict

from fastapi.encoders import jsonable_encoder

from hackaprompt.completers import get_completer
from hackaprompt.evaluator import Response, get_evaluator
from hackaprompt.utils import init_db

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

database = init_db()

level_names = [
    "level_1",
    "level_2",
    "level_3",
    "level_4",
    "level_5",
    "level_6",
    "level_7",
    "level_8",
    "level_9",
    "level_10",
]

def log_to_db(evaluation):
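    """Save the evaluation to the 'evaluations' MongoDB collection; failures are logged, not raised."""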
    # save response to mongodb database
    try:
        submission_json = jsonable_encoder(evaluation)
        database["evaluations"].insert_one(submission_json)
        logger.info("response logged to mongodb")
    except Exception:
        logger.exception("Something went wrong logging to mongodb")

def evaluate_submission(submission: dict) -> Dict[str, Response]:
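    """Evaluate a submission against every level in `level_names`.

    Returns a dict mapping each level name to its Response. Levels without both a
    prompt and a model in the submission get an empty zero-score Response so that
    no completion credits are spent on them.
    """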
    evaluation: dict = {level: None for level in level_names}

    openai_api_key = os.getenv("OPENAI_API_KEY")
    assert openai_api_key is not None, "Set a valid OpenAI API key."

    for level_name in level_names:
        logger.info(f"Evaluating {level_name}...")
        level = int(level_name.split("_")[-1])  # handles double-digit levels like "level_10"

        user_input = submission.get(level_name) or {}
        prompt = user_input.get("prompt", "")
        model = user_input.get("model", "")

        if prompt and model:
            completer = get_completer(model=model, openai_api_key=openai_api_key)
            evaluator = get_evaluator(level=level, completer=completer)
            response: Response = evaluator.evaluate(prompt)
            # TODO: Do we want to add auto-retries?
        else:
            # return an empty response instead, no need to waste GPT credits.
            response = Response(
                level=level,
                score=0,
                error=False,
            )
        evaluation[level_name] = response

    logger.info(f"evaluation done. Summary: {evaluation=}")
    return evaluation

def check_evaluation_errors(evaluation: Dict[str, Response]) -> bool:
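    """Return True if any per-level Response reported an error, logging a warning for each."""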
    # check for any errors during evaluation...
    evaluation_errors = False
    for level_name, response in evaluation.items():
        if response.error:
            logger.warning(f"Error encountered in {level_name} evaluation")
            evaluation_errors = True
    return evaluation_errors

def get_evaluation_total_score(evaluation: Dict[str, Response]) -> int:
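    """Sum the scores across all levels present in the evaluation, ignoring None scores."""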
    # Assumes no errors present during evaluation
    total_score = 0
    for level_name in level_names:
        if level_name in evaluation:
            score = evaluation[level_name].score
            if score is not None:
                total_score += score
    return total_score

if __name__ == "__main__":
    submission = {
        "level_1": {"prompt": "plz work", "model": "FlanT5-XXL"},
    }
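
    # A minimal sketch of loading the submission from a JSON file instead of the
    # hard-coded dict above (the file name "submission.json" is an assumption for
    # illustration; the expected structure matches the dict above):
    #
    #     import json
    #     with open("submission.json") as f:
    #         submission = json.load(f)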
    # TODO: For now, we assume only valid models can be submitted in a submission file...
    # it will raise a NotImplementedError otherwise
    # Need to add error handling if we care to handle it ourselves
    evaluation = evaluate_submission(submission)

    evaluation_error = check_evaluation_errors(evaluation)

    if not evaluation_error:
        log_to_db(evaluation=evaluation)
        total_score = get_evaluation_total_score(evaluation)
        print(total_score)
    else:
        print("error encountered during evaluation")