Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Running

App Files Files Community

future-xy committed on Apr 4

Commit

b9f0099

•

1 Parent(s): 9ceb74b

add system performance metrics

Browse files

Files changed (3) hide show

src/backend/hflm_with_measurement.py +2 -1
src/backend/tasks/measurement_task_utils.py +52 -0
src/backend/tasks/selfcheckgpt/task.py +3 -0

src/backend/hflm_with_measurement.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import copy
 import os
 from datetime import timedelta
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
@@ -195,7 +196,7 @@ class HFLMWithMeasurement(HFLM):
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
-                res.append(s)
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

 import copy
 import os
 from datetime import timedelta
+import random
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
+                res.append((s, random.random()))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)

src/backend/tasks/measurement_task_utils.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import functools
+from lm_eval.api.metrics import mean
def process_results_decorator(func):
    """Wrap a task's ``process_results`` so it also reports average latency.

    The wrapper treats every entry of ``results`` as an indexable pair:
    element ``0`` is forwarded to the wrapped ``func`` and element ``1`` is
    averaged as the latency value. The average is printed and stored under
    the ``"latency"`` key of the dict returned by ``func``.

    Args:
        func: The original ``process_results(self, doc, results, ...)``
            method. It must return a dict that accepts item assignment.

    Returns:
        The wrapping function, with ``func``'s metadata preserved.
    """
    @functools.wraps(func)
    def wrapper(self, doc, results, *args, **kwargs):
        # Split each (output, latency) pair into its two components.
        processed_results = [r[0] for r in results]
        latencies = [r[1] for r in results]
        # An empty result list would otherwise raise ZeroDivisionError;
        # report 0.0 so the wrapped method still gets to run.
        latency = sum(latencies) / len(latencies) if latencies else 0.0
        print(f"Average latency: {latency}")
        # The wrapped method only ever sees the bare outputs.
        result_dict = func(self, doc, processed_results, *args, **kwargs)
        result_dict["latency"] = latency
        return result_dict
    return wrapper
def aggregation_decorator(func):
    """Wrap a task's ``aggregation`` so ``"latency"`` is aggregated with ``mean``.

    Args:
        func: The original ``aggregation(self, ...)`` method, expected to
            return a mapping from metric name to aggregation function.

    Returns:
        The wrapping function. It returns a new dict holding every entry
        from ``func`` plus ``"latency": mean``.
    """
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # Copy first: the wrapped method may hand back a dict it owns, and
        # adding a key in place would silently change that object.
        aggregation_list = dict(func(self, *args, **kwargs))
        aggregation_list["latency"] = mean
        return aggregation_list
    return wrapper
def higher_is_better_decorator(func):
    """Wrap a task's ``higher_is_better`` so ``"latency"`` is marked lower-is-better.

    Args:
        func: The original ``higher_is_better(self, ...)`` method, expected
            to return a mapping from metric name to a bool.

    Returns:
        The wrapping function. It returns a new dict holding every entry
        from ``func`` plus ``"latency": False``.
    """
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        # Copy first: the wrapped method may hand back a dict it owns, and
        # adding a key in place would silently change that object.
        higher_is_better_dict = dict(func(self, *args, **kwargs))
        higher_is_better_dict["latency"] = False
        return higher_is_better_dict
    return wrapper
def measure_system_metrics(cls):
    """Class decorator that adds latency reporting to a task class.

    For each of ``process_results``, ``aggregation`` and
    ``higher_is_better``, if ``cls`` exposes a callable attribute of that
    name, the attribute is replaced by a version wrapped with the matching
    decorator(s). Names that are missing or not callable are left alone.

    Returns:
        The same ``cls`` object, modified in place.
    """
    wrappers_by_method = (
        ('process_results', [process_results_decorator]),
        ('aggregation', [aggregation_decorator]),
        ('higher_is_better', [higher_is_better_decorator]),
    )
    for attr_name, wrappers in wrappers_by_method:
        target = getattr(cls, attr_name, None)
        if not callable(target):
            continue
        # Apply in reverse so the first listed decorator ends up outermost.
        for wrap in wrappers[::-1]:
            target = wrap(target)
        setattr(cls, attr_name, target)
    return cls

src/backend/tasks/selfcheckgpt/task.py CHANGED Viewed

@@ -12,8 +12,11 @@ from src.backend.envs import DEVICE
 import spacy
 from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
 # @register_task("selfcheckgpt")
 class SelfCheckGPT(ConfigurableTask):
     VERSION = 0.0
     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"

 import spacy
 from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
+from src.backend.tasks.measurement_task_utils import measure_system_metrics
 # @register_task("selfcheckgpt")
+@measure_system_metrics
 class SelfCheckGPT(ConfigurableTask):
     VERSION = 0.0
     DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"