compassjudger_subj_eval_leaderboard

Running

App Files Files Community

linjunyao committed 15 days ago

Commit

80c5be6

•

1 Parent(s): b7d59b5

updated dataset; changed tab and cell formats

Browse files

Files changed (10) hide show

app.py +6 -6
data/CJ-1-32B.csv +11 -0
data/CJ-1-7B.csv +11 -0
data/detail_a_cn.csv +0 -11
data/detail_a_en.csv +0 -11
data/detail_b_acc.csv +0 -9
data/detail_b_corr.csv +0 -9
data/overall.csv +0 -11
judgerbench/meta_data.py +8 -9
start_gradio_web_server.sh +1 -1

app.py CHANGED Viewed

@@ -65,10 +65,10 @@ with gr.Blocks() as demo:
         for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
             tab_name = filename
-            if filename == "overall":
-                tab_name = '🏅 JudgerBench Main Leaderboard'
-            with gr.Tab(tab_name, elem_id=f'tab_{cur_id}', id=cur_id):
                 # gr.Markdown(LEADERBOARD_MD['MAIN'])
                 # _, check_box = build_l1_df(MAIN_FIELDS)
@@ -128,10 +128,10 @@ with gr.Blocks() as demo:
                     table_styler = (
                         table.style.apply(cell_styler, axis=None)
-                        .format(precision=3)
                     )
                 else:
-                    table_styler = table.style.format(prevision=3)
                 # with gr.Row():
                 #     model_size = gr.CheckboxGroup(
@@ -291,7 +291,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="0.0.0.0")
-    parser.add_argument("--port", type=int)
     parser.add_argument(
         "--share",
         action="store_true",

         for cur_id, (filename, filepath) in enumerate(LEADERBOARD_FILE_MAPPING.items()):
             tab_name = filename
+            # if filename == "overall":
+            #     tab_name = '🏅 JudgerBench Main Leaderboard'
+            with gr.Tab(tab_name.upper(), elem_id=f'tab_{cur_id}', id=cur_id):
                 # gr.Markdown(LEADERBOARD_MD['MAIN'])
                 # _, check_box = build_l1_df(MAIN_FIELDS)
                     table_styler = (
                         table.style.apply(cell_styler, axis=None)
+                        .format(precision=1)
                     )
                 else:
+                    table_styler = table.style.format(precision=1)
                 # with gr.Row():
                 #     model_size = gr.CheckboxGroup(
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="0.0.0.0")
+    parser.add_argument("--port", type=int, default="7860")
     parser.add_argument(
         "--share",
         action="store_true",

data/CJ-1-32B.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
+qwen2.5-72b-instruct-turbomind,70,84.28,64,70,39.71333333,65.59866667
+qwen2.5-32b-instruct-turbomind,68.8,78.02,57,64,23.25,58.214
+qwen2.5-14b-instruct-turbomind,68,71.25,53,56,22.71666667,54.19333333
+qwen2.5-7b-instruct-turbomind,61.8,57.36,48,46,16.77333333,45.98666667
+llama-3_1-70b-instruct-turbomind,54.6,60.71,55,44,12.50333333,45.36266667
+internlm2_5-20b-chat-turbomind,63,31.62,41,53,5.703333333,38.86466667
+llama-3_1-8b-instruct-turbomind,47.7,33.88,41,34,-3.286666667,30.65866667
+internlm2_5-7b-chat-turbomind,57.5,17.16,38,45,-13.33,28.866
+qwen2.5-3b-instruct-turbomind,55.6,31.44,31,30,-8.953333333,27.81733333
+qwen2.5-1.5b-instruct-turbomind,46.8,11.37,13,14,-54.71,6.092

data/CJ-1-7B.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+Models,Alignbench,ArenaHard,Fofo_en,Fofo_cn,Wildbench,Average
+qwen2.5-72b-instruct-turbomind,67.2,81.83,50,58,35.74333333,58.55466667
+qwen2.5-32b-instruct-turbomind,65.4,71.63,45,55,19.06,51.218
+qwen2.5-14b-instruct-turbomind,64.8,71.71,44,48,19.62666667,49.62733333
+qwen2.5-7b-instruct-turbomind,60.8,60.93,40,46,18.57333333,45.26066667
+llama-3_1-70b-instruct-turbomind,53.2,59.24,45,38,9.636666667,41.01533333
+internlm2_5-20b-chat-turbomind,60.5,33.58,41,49,5.103333333,37.83666667
+llama-3_1-8b-instruct-turbomind,47.2,38.76,36,31,-1.173333333,30.35733333
+qwen2.5-3b-instruct-turbomind,55.2,38.12,30,30,-3.023333333,30.05933333
+internlm2_5-7b-chat-turbomind,57.2,17.9,34,43,-12.95333333,27.82933333
+qwen2.5-1.5b-instruct-turbomind,47,14.58,16,14,-46.41,9.034

data/detail_a_cn.csv DELETED Viewed

@@ -1,11 +0,0 @@
-Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
-CJ-1-14B,0.69,0.61,0.51,0.55,0.71,0.68,0.6,0.58,0.61,0.65,0.619,Judge
-GPT-4o-0806,0.77,0.56,0.51,0.53,0.67,0.66,0.63,0.58,0.62,0.58,0.611,API
-CJ-1-32B,0.69,0.58,0.53,0.52,0.71,0.53,0.6,0.61,0.61,0.69,0.607,Judge
-Skywork-llama3.1-8B,0.62,0.58,0.58,0.59,0.63,0.58,0.6,0.61,0.6,0.61,0.6,Judge
-Qwen2.5-72B-Chat,0.65,0.47,0.49,0.47,0.71,0.6,0.57,0.58,0.69,0.6,0.583,General
-CJ-1-7B,0.62,0.54,0.41,0.58,0.7,0.6,0.59,0.56,0.59,0.6,0.579,Judge
-Qwen2-72B-Chat,0.62,0.54,0.34,0.55,0.68,0.63,0.58,0.58,0.62,0.64,0.578,General
-Selftaught-llama3.1-70B,0.62,0.56,0.55,0.48,0.67,0.55,0.57,0.57,0.51,0.61,0.569,Judge
-Qwen2.5-7B-Chat,0.46,0.58,0.36,0.45,0.7,0.53,0.52,0.53,0.52,0.64,0.529,General
-CJ-1-1.5B,0.54,0.58,0.38,0.38,0.62,0.63,0.54,0.52,0.55,0.54,0.528,Judge

data/detail_a_en.csv DELETED Viewed

@@ -1,11 +0,0 @@
-Models,Teaser,AI,Roleplay,Chat,Math,Reasoning,Creation,Code,Science,Humanities,Average,Class
-GPT-4o-0806,0.82,0.53,0.62,0.61,0.83,0.67,0.67,0.73,0.64,0.55,0.667,API
-Skywork-llama3.1-8B,0.69,0.61,0.54,0.62,0.63,0.64,0.6,0.69,0.74,0.53,0.629,Judge
-Qwen2.5-72B-Chat,0.68,0.57,0.57,0.47,0.78,0.64,0.58,0.75,0.61,0.52,0.617,General
-CJ-1-32B,0.66,0.57,0.56,0.59,0.78,0.58,0.55,0.75,0.6,0.49,0.613,Judge
-CJ-1-14B,0.66,0.51,0.57,0.54,0.72,0.61,0.56,0.74,0.61,0.47,0.599,Judge
-Qwen2-72B-Chat,0.63,0.59,0.54,0.49,0.62,0.64,0.6,0.74,0.51,0.52,0.588,General
-CJ-1-7B,0.56,0.56,0.51,0.47,0.68,0.58,0.58,0.75,0.58,0.43,0.57,Judge
-Qwen2.5-7B-Chat,0.54,0.59,0.59,0.46,0.69,0.43,0.61,0.65,0.58,0.52,0.566,General
-CJ-1-1.5B,0.42,0.56,0.56,0.43,0.66,0.47,0.55,0.78,0.64,0.44,0.551,Judge
-Selftaught-llama3.1-70B,0.47,0.45,0.47,0.37,0.45,0.43,0.36,0.58,0.48,0.36,0.442,Judge

data/detail_b_acc.csv DELETED Viewed

@@ -1,9 +0,0 @@
-Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
-CJ-1-32B,0.857,0.806,0.596,0.621,0.72,Judge
-CJ-1-14B,0.839,0.787,0.566,0.602,0.699,Judge
-CJ-1-7B,0.816,0.783,0.564,0.586,0.687,Judge
-Qwen2.5-72B-Chat,0.878,0.677,0.599,0.57,0.681,General
-CJ-1-1.5B,0.822,0.712,0.55,0.43,0.629,Judge
-Qwen2-72B-Chat,0.867,0.692,0.564,0.376,0.625,General
-Selftaught-llama3.1-70B,0.755,0.627,0.538,0.472,0.598,Judge
-Qwen2.5-7B-Chat,0.777,0.67,0.47,0.444,0.59,General

data/detail_b_corr.csv DELETED Viewed

@@ -1,9 +0,0 @@
-Models,AlignBench,Fofo,WildBench,ArenaHard,Average,Class
-CJ-1-32B,0.973,0.951,0.954,0.975,0.963,Judge
-CJ-1-14B,0.966,0.956,0.965,0.951,0.959,Judge
-CJ-1-7B,0.956,0.936,0.97,0.932,0.948,Judge
-Qwen2.5-72B-Chat,0.964,0.916,0.958,0.912,0.937,General
-Qwen2-72B-Chat,0.937,0.889,0.976,0.936,0.935,General
-CJ-1-1.5B,0.928,0.851,0.981,0.858,0.905,Judge
-Qwen2.5-7B-Chat,0.916,0.681,0.967,0.931,0.874,General
-Selftaught-llama3.1-70B,0.918,0.667,0.95,0.942,0.869,Judge

data/overall.csv DELETED Viewed

@@ -1,11 +0,0 @@
-Models,JDB-A EN,JDB-A CN,JDB-B Acc,JDB-B Corr,JudgerBench,Class
-GPT-4o-0806,0.664,0.608,1,1,0.818,API
-CJ-1-32B,0.614,0.612,0.72,0.963,0.727,Judge
-CJ-1-14B,0.599,0.615,0.699,0.959,0.718,Judge
-Qwen2.5-72B-Chat,0.615,0.59,0.681,0.937,0.706,General
-CJ-1-7B,0.57,0.583,0.687,0.948,0.697,Judge
-Qwen2-72B-Chat,0.588,0.584,0.625,0.935,0.683,General
-CJ-1-1.5B,0.553,0.527,0.629,0.905,0.654,Judge
-Qwen2.5-7B-Chat,0.567,0.535,0.59,0.874,0.641,General
-Selftaught-llama3.1-70B,0.443,0.57,0.598,0.869,0.62,Judge
-Skywork-llama3.1-8B,0.63,0.605,-,-,-,Judge

judgerbench/meta_data.py CHANGED Viewed

@@ -6,13 +6,10 @@ import os
 REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
 DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
-LEADERBOARD_FILE_MAPPING = dict(
-    overall="overall.csv",
-    detail_a_cn="detail_a_cn.csv",
-    detail_a_en="detail_a_en.csv",
-    detail_b_acc="detail_b_acc.csv",
-    detail_b_corr="detail_b_corr.csv",
-)
 STYLE_CLASS_MAPPING = {
     "API": '#82e0aa',
@@ -33,11 +30,13 @@ CITATION_BUTTON_TEXT = r"""@misc{2023opencompass,
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 # CONSTANTS-TEXT
-LEADERBORAD_INTRODUCTION = """# JudgerBench Leaderboard
-### Welcome to the JudgerBench Leaderboard!
 This leaderboard was last updated: {}.
 """
 # CONSTANTS-FIELDS

 REPO_PATH = Path(os.path.dirname(os.path.dirname(__file__)))
 DATADIR = REPO_PATH / Path(os.getenv("DATADIR", "./data"))
+LEADERBOARD_FILE_MAPPING = {
+    "cj-1-7b": "CJ-1-7B.csv",
+    "cj-1-32b": "CJ-1-32B.csv",
+}
 STYLE_CLASS_MAPPING = {
     "API": '#82e0aa',
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 # CONSTANTS-TEXT
+LEADERBORAD_INTRODUCTION = """# CompassJudger Subjective Evaluation Leaderboard
+### Welcome to the CompassJudger Subjective Evaluation Leaderboard!
 This leaderboard was last updated: {}.
+*All results from the corresponding datasets have been normalized to percentages.
 """
 # CONSTANTS-FIELDS

start_gradio_web_server.sh CHANGED Viewed

@@ -2,7 +2,7 @@ conda activate judgerbench
 # GRADIO_HOSTNAME=0.0.0.0
 GRADIO_HOSTNAME=$(hostname)
-GRADIO_PORT=7861
 # Set tmp and logs folders for gradio
 export TMPDIR="tmp"

 # GRADIO_HOSTNAME=0.0.0.0
 GRADIO_HOSTNAME=$(hostname)
+GRADIO_PORT=7862
 # Set tmp and logs folders for gradio
 export TMPDIR="tmp"