from lm_eval import evaluator
from lm_eval.tasks import TaskManager
from lm_eval.api.metrics import mean
from lm_eval.api.task import ConfigurableTask

from src.backend.manage_requests import EvalRequest


# Monkey-patch lm-eval's ConfigurableTask so every task also reports
# efficiency metrics (latency, throughput, MFU, MBU) alongside its own
# metrics. Keep references to the original methods so the decorators
# below can delegate to them.
orig_process_results = ConfigurableTask.process_results
orig_aggregation = ConfigurableTask.aggregation
orig_higher_is_better = ConfigurableTask.higher_is_better

def process_results_decorator(func):
    # Each raw result is a tuple:
    #   (answer, end_to_end_time, prefilling_time, decoding_throughput, mfu, mbu).
    # Strip the efficiency measurements off before delegating to the original
    # process_results, then attach their per-doc averages to the result dict.
    def wrapper(self, doc, results, *args, **kwargs):
        processed_results = [r[0] for r in results]

        end_to_end_time = mean([r[1] for r in results])
        prefilling_time = mean([r[2] for r in results])
        decoding_throughput = mean([r[3] for r in results])
        mfu = mean([r[4] for r in results])
        mbu = mean([r[5] for r in results])

        result_dict = func(self, doc, processed_results, *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        result_dict["mfu"] = mfu
        result_dict["mbu"] = mbu
        return result_dict
    return wrapper
ConfigurableTask.process_results = process_results_decorator(orig_process_results)

def aggregation_decorator(func):
    # Aggregate each efficiency metric across documents with a simple mean.
    def wrapper(self, *args, **kwargs):
        aggregation_list = func(self, *args, **kwargs)
        aggregation_list["end_to_end_time"] = mean
        aggregation_list["prefilling_time"] = mean
        aggregation_list["decoding_throughput"] = mean
        aggregation_list["mfu"] = mean
        aggregation_list["mbu"] = mean
        return aggregation_list
    return wrapper
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)

def higher_is_better_decorator(func):
    # Latency metrics improve as they decrease; throughput and utilization
    # (MFU/MBU) improve as they increase.
    def wrapper(self, *args, **kwargs):
        higher_is_better_dict = func(self, *args, **kwargs)
        higher_is_better_dict["end_to_end_time"] = False
        higher_is_better_dict["prefilling_time"] = False
        higher_is_better_dict["decoding_throughput"] = True
        higher_is_better_dict["mfu"] = True
        higher_is_better_dict["mbu"] = True
        return higher_is_better_dict
    return wrapper
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
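
# Illustrative sketch of the three patches above (comments only, hypothetical
# values): for two generation requests whose raw results are
#   [("answer A", 1.2, 0.3, 40.0, 0.35, 0.60),
#    ("answer B", 0.8, 0.1, 50.0, 0.45, 0.70)]
# the patched process_results would report, next to the task's own metrics,
#   end_to_end_time=1.0, prefilling_time=0.2, decoding_throughput=45.0,
#   mfu=0.40, mbu=0.65
# and the harness would then mean-aggregate these values across documents,
# treating the two latency metrics as lower-is-better.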

# from src.backend.tasks.xsum.task import XSum
# from src.backend.tasks.xsum.task_v2 import XSumv2

# from src.backend.tasks.cnndm.task import CNNDM
# from src.backend.tasks.cnndm.task_v2 import CNNDMv2

# Imported for their side effects: these modules are assumed to register the
# SelfCheckGPT task and the "hf-chat" / "moe-infinity" model backends with
# lm-eval's registries.
from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT

from src.backend.huggingface_generate_until import HFLMwithChatTemplate
from src.backend.moe_infinity import MoEHFLM

def run_evaluation(
    eval_request: EvalRequest,
    task_names,
    num_fewshot,
    batch_size,
    device,
    use_cache=None,
    limit=None,
    max_nb_samples=100,
) -> dict:
    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    # Custom tasks (e.g. SelfCheckGPT) live under src/backend/tasks/.
    task_manager = TaskManager(include_path="./src/backend/tasks/")

    print(f"Selected Tasks: {task_names}")
    print(f"Eval Request: {eval_request}")
    print(
        f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}"
    )
    # "hf-chat" is implemented to use apply_chat_template; "moe-infinity"
    # serves the model through MoE-Infinity.
    results = evaluator.simple_evaluate(
        model=eval_request.inference_framework,  # "hf-chat" or "moe-infinity"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        max_batch_size=8,
        device=device,
        use_cache=use_cache,
        limit=limit,
        write_out=True,
        task_manager=task_manager,
        verbosity="WARNING",
    )
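
    # simple_evaluate returns a dict with per-task metric values under
    # "results", run metadata under "config" (extended below), and, when the
    # harness is configured to log them, per-document records under "samples"
    # (hence the existence check further down).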

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision
    results["config"]["inference_framework"] = eval_request.inference_framework

    # Cap the number of logged samples per task to keep result files small.
    if max_nb_samples is not None and "samples" in results:
        for task_name, task_samples in results["samples"].items():
            if len(task_samples) > max_nb_samples:
                results["samples"][task_name] = task_samples[:max_nb_samples]

    # print(evaluator.make_table(results))

    return results
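
# A minimal usage sketch (hypothetical values; the EvalRequest fields are
# inferred from their uses above: model, revision, precision,
# inference_framework, plus get_model_args() for the harness model_args):
#
#   request = EvalRequest(
#       model="mistralai/Mixtral-8x7B-Instruct-v0.1",
#       revision="main",
#       precision="bfloat16",
#       inference_framework="moe-infinity",
#   )
#   results = run_evaluation(
#       eval_request=request,
#       task_names=["selfcheckgpt"],
#       num_fewshot=0,
#       batch_size=1,
#       device="cuda:0",
#       limit=10,  # testing only; never for real metrics
#   )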