kennymckormick committed
Commit: 3307746
Parent(s): 5c63389

update app.py

Files changed:
- app.py (+1 / -1)
- meta_data.py (+3 / -2)
app.py CHANGED

@@ -22,7 +22,7 @@ with gr.Blocks() as demo:
     with gr.Tabs(elem_classes='tab-buttons') as tabs:
         with gr.TabItem('🏅 OpenVLM Main Leaderboard', elem_id='main', id=0):
             gr.Markdown(LEADERBOARD_MD['MAIN'])
-            table, check_box = BUILD_L1_DF(results,
+            table, check_box = BUILD_L1_DF(results, DEFAULT_BENCH)
             type_map = check_box['type_map']
             checkbox_group = gr.CheckboxGroup(
                 choices=check_box['all'],
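The functional change in app.py is the second argument now passed to BUILD_L1_DF: the main leaderboard table is built from DEFAULT_BENCH. As a rough orientation only, not the leaderboard's actual implementation, a BUILD_L1_DF compatible with this call and with the check_box keys used below it ('all', 'type_map') might look like the following sketch; the results layout and every other detail in it are assumptions.

# Sketch only: a BUILD_L1_DF compatible with the call in app.py, not the real code.
# Assumed input: results = {model_name: {field_name: value, ...}}.
import pandas as pd

def BUILD_L1_DF(results, fields):
    records = []
    for model, res in results.items():
        row = {'Method': model}
        for f in fields:
            row[f] = res.get(f)  # missing benchmark results stay as None
        records.append(row)
    table = pd.DataFrame(records)
    # check_box feeds the gr.CheckboxGroup: which columns can be toggled
    # ('all') and how each one should be rendered ('type_map').
    check_box = {
        'all': fields,
        'type_map': {f: 'number' for f in fields},
    }
    return table, check_box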
meta_data.py CHANGED

@@ -24,7 +24,8 @@ META_FIELDS = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model', 'Op
 MAIN_FIELDS = [
     'MMBench_V11', 'MMStar', 'MME',
     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
-    'HallusionBench', 'SEEDBench_IMG', 'MMVet',
+    'HallusionBench', 'SEEDBench_IMG', 'MMVet',
+    'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST'
 ]
 DEFAULT_BENCH = [
     'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',

@@ -43,7 +44,7 @@ LEADERBOARD_MD['MAIN'] = f"""
 - Metrics:
   - Avg Score: The average score on all VLM Benchmarks (normalized to 0 - 100, the higher the better).
   - Avg Rank: The average rank on all VLM Benchmarks (the lower the better).
-  - Avg Score & Rank are calculated based on selected benchmark.
+  - Avg Score & Rank are calculated based on selected benchmark. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!!!**
 - By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted by the descending order of Avg Score.
 - The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
 - Detailed evaluation results for each dataset (included or not included in main) are provided in the consequent tabs.
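The new markdown line in LEADERBOARD_MD['MAIN'] states that Avg Score / Avg Rank become None when a selected benchmark has no result for a model. A minimal sketch of that rule, assuming a pandas table with one row per model and one numeric column per benchmark; the function name and column labels are hypothetical, not the leaderboard's code.

# Sketch only: Avg Score / Avg Rank over the selected benchmarks,
# set to NaN (displayed as None) when any selected benchmark result is missing.
import pandas as pd

def add_avg_columns(table: pd.DataFrame, selected: list) -> pd.DataFrame:
    scores = table[selected]
    ranks = scores.rank(ascending=False, method='min')  # rank 1 = best per benchmark
    complete = scores.notna().all(axis=1)               # every selected benchmark present?
    table['Avg Score'] = scores.mean(axis=1).where(complete).round(1)
    table['Avg Rank'] = ranks.mean(axis=1).where(complete).round(2)
    return table.sort_values('Avg Score', ascending=False, na_position='last')

Under this rule a model missing even one of the selected benchmarks sorts to the bottom with blank averages, instead of being averaged over only the benchmarks it does have.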