#!/usr/bin/env python

import logging
import pprint
import random
from typing import Optional

from huggingface_hub import snapshot_download

from src.backend.envs import EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, Tasks
from src.backend.manage_requests import EvalRequest, get_eval_requests
from src.backend.sort_queue import sort_models_by_priority
from src.envs import API, QUEUE_REPO, RESULTS_REPO
from src.leaderboard.read_evals import EvalResult, get_raw_eval_results

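# Logging: WARNING and above from the OpenAI client, ERROR and above from everything else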
logging.getLogger("openai").setLevel(logging.WARNING)
logging.basicConfig(level=logging.ERROR)
pp = pprint.PrettyPrinter(width=80)

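# Status values a request in the evaluation queue can have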
PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

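# Every benchmark task the harness can evaluate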
TASKS_HARNESS = [task.value for task in Tasks]

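# Mirror the results and requests datasets from the Hub into the local backend directories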
snapshot_download(
    repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
)
snapshot_download(
    repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
)


def request_to_result_name(request: EvalRequest) -> str:
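    """Derive the name under which results for this request are stored: "[org_]model_precision"."""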
    org_and_model = request.model.split("/", 1)
    if len(org_and_model) == 1:
        model = org_and_model[0]
        res = f"{model}_{request.precision}"
    else:
        org = org_and_model[0]
        model = org_and_model[1]
        res = f"{org}_{model}_{request.precision}"
    return res


def process_finished_requests() -> bool:
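    """Check every FINISHED request against the downloaded results and report tasks that are still missing."""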
    current_finished_status = [FINISHED_STATUS]

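    # Disabled debug path: inspect the raw result files of a single model instead of processing the queue.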
    if False:
        import os
        import dateutil.parser  # the submodule must be imported for the ParserError handler below

        model_result_filepaths = []
        results_path = f"{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B"
        requests_path = f"{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json"

        for root, _, files in os.walk(results_path):
            # We should only have json files in model results
            if len(files) == 0 or any([not f.endswith(".json") for f in files]):
                continue

            # Sort the files by date
            try:
                files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
            except dateutil.parser._parser.ParserError:
                files = [files[-1]]

            for file in files:
                model_result_filepaths.append(os.path.join(root, file))

        eval_results = {}
        for model_result_filepath in model_result_filepaths:
            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
            eval_result.update_with_request_file(requests_path)

            print("XXX", eval_result)

            # Store results of same eval together
            eval_name = eval_result.eval_name
            if eval_name in eval_results.keys():
                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
            else:
                eval_results[eval_name] = eval_result

        print(eval_results)

        return True

    # Get all eval requests that are FINISHED; to process other statuses, change this parameter
    eval_requests: list[EvalRequest] = get_eval_requests(
        job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
    )
    # Sort the evals by priority (first submitted, first run)
    eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)

    # Debug filter (disabled): restrict the queue to a single model
    # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]

    # Note: shuffling here means the priority order above is not preserved
    random.shuffle(eval_requests)

    # Load every result already present in the local results snapshot
    eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)

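    # Index requests and results by their shared result name so they can be matched up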
    result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
    result_name_to_result = {r.eval_name: r for r in eval_results}

    for eval_request in eval_requests:
        result_name: str = request_to_result_name(eval_request)

        # Check the corresponding result, if any
        eval_result: Optional[EvalResult] = result_name_to_result.get(result_name)

        # Iterate over tasks and report every task for which results are still missing
        for task in TASKS_HARNESS:
            task_name = task.benchmark

            if eval_result is None or task_name not in eval_result.results:
                eval_request: EvalRequest = result_name_to_request[result_name]

                # print(eval_result)
                print(result_name, "is incomplete -- missing task:", task_name, eval_result, eval_request.likes)

    return True


if __name__ == "__main__":
    res = process_finished_requests()