compassjudger_subj_eval_leaderboard

Running

App Files Files Community

kennymckormick committed on Mar 18

Commit

a6e43e6

•

1 Parent(s): b11357b

update

Browse files

Files changed (6) hide show

.pre-commit-config.yaml +33 -0
README.md +1 -1
app.py +37 -32
gen_table.py +146 -0
lb_info.py → meta_data.py +1 -136
requirements.txt +1 -1

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+# pre-commit configuration for this repository.
+# Top-level exclude: a verbose-mode ((?x)) regex of paths kept out of the hooks;
+# meta_data.py is the only entry.
+exclude: |
+  (?x)^(
+      meta_data.py
+  )
+repos:
+  # Linting: flake8 with a 120-column limit; the listed codes are ignored.
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args: ["--max-line-length=120", "--ignore=F401,F403,F405,E402"]
+        exclude: ^configs/
+  # Import ordering.
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.11.5
+    hooks:
+      - id: isort
+  # Code formatting: yapf with the same 120-column limit as flake8.
+  - repo: https://github.com/pre-commit/mirrors-yapf
+    rev: v0.30.0
+    hooks:
+      - id: yapf
+        args: ["--style={column_limit=120}"]
+  # Generic file hygiene hooks.
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v3.1.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: double-quote-string-fixer
+      - id: check-merge-conflict
+      - id: fix-encoding-pragma
+        args: ["--remove"]
+      - id: mixed-line-ending
+        args: ["--fix=lf"]

README.md CHANGED Viewed

@@ -12,4 +12,4 @@ tags:
   - leaderboard
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

   - leaderboard
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,6 +1,9 @@
 import abc
 import gradio as gr
-from lb_info import *
 with gr.Blocks() as demo:
     struct = load_results()
@@ -24,30 +27,30 @@ with gr.Blocks() as demo:
             checkbox_group = gr.CheckboxGroup(
                 choices=check_box['all'],
                 value=check_box['required'],
-                label="Evaluation Dimension",
                 interactive=True,
             )
             headers = check_box['essential'] + checkbox_group.value
             with gr.Row():
                 model_size = gr.CheckboxGroup(
-                    choices=MODEL_SIZE,
-                    value=MODEL_SIZE,
                     label='Model Size',
                     interactive=True
                 )
                 model_type = gr.CheckboxGroup(
-                    choices=MODEL_TYPE,
-                    value=MODEL_TYPE,
                     label='Model Type',
                     interactive=True
                 )
             data_component = gr.components.DataFrame(
-                value=table[headers],
-                type="pandas",
                 datatype=[type_map[x] for x in headers],
-                interactive=False,
                 visible=True)
             def filter_df(fields, model_size, model_type):
                 headers = check_box['essential'] + fields
                 df = cp.deepcopy(table)
@@ -58,12 +61,12 @@ with gr.Blocks() as demo:
                     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                     df = df[df['flag']]
                     df.pop('flag')
                 comp = gr.components.DataFrame(
-                    value=df[headers],
-                    type="pandas",
                     datatype=[type_map[x] for x in headers],
-                    interactive=False,
                     visible=True)
                 return comp
@@ -84,31 +87,31 @@ with gr.Blocks() as demo:
                 s.checkbox_group = gr.CheckboxGroup(
                     choices=s.check_box['all'],
                     value=s.check_box['required'],
-                    label=f"{dataset} CheckBoxes",
                     interactive=True,
                 )
                 s.headers = s.check_box['essential'] + s.checkbox_group.value
                 with gr.Row():
                     s.model_size = gr.CheckboxGroup(
-                        choices=MODEL_SIZE,
-                        value=MODEL_SIZE,
                         label='Model Size',
                         interactive=True
                     )
                     s.model_type = gr.CheckboxGroup(
-                        choices=MODEL_TYPE,
-                        value=MODEL_TYPE,
                         label='Model Type',
                         interactive=True
                     )
                 s.data_component = gr.components.DataFrame(
-                    value=s.table[s.headers],
-                    type="pandas",
                     datatype=[s.type_map[x] for x in s.headers],
-                    interactive=False,
                     visible=True)
                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
                 def filter_df_l2(dataset_name, fields, model_size, model_type):
                     s = structs[DATASETS.index(dataset_name)]
                     headers = s.check_box['essential'] + fields
@@ -120,25 +123,27 @@ with gr.Blocks() as demo:
                         df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                         df = df[df['flag']]
                         df.pop('flag')
                     comp = gr.components.DataFrame(
-                        value=df[headers],
-                        type="pandas",
                         datatype=[s.type_map[x] for x in headers],
-                        interactive=False,
                         visible=True)
                     return comp
                 for cbox in [s.checkbox_group, s.model_size, s.model_type]:
-                    cbox.change(fn=filter_df_l2, inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type], outputs=s.data_component)
     with gr.Row():
-        with gr.Accordion("Citation", open=False):
             citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id='citation-button')
 if __name__ == '__main__':
-    demo.launch(server_name='0.0.0.0')

 import abc
 import gradio as gr
+from gen_table import *
+from meta_data import *
 with gr.Blocks() as demo:
     struct = load_results()
             checkbox_group = gr.CheckboxGroup(
                 choices=check_box['all'],
                 value=check_box['required'],
+                label='Evaluation Dimension',
                 interactive=True,
             )
             headers = check_box['essential'] + checkbox_group.value
             with gr.Row():
                 model_size = gr.CheckboxGroup(
+                    choices=MODEL_SIZE,
+                    value=MODEL_SIZE,
                     label='Model Size',
                     interactive=True
                 )
                 model_type = gr.CheckboxGroup(
+                    choices=MODEL_TYPE,
+                    value=MODEL_TYPE,
                     label='Model Type',
                     interactive=True
                 )
             data_component = gr.components.DataFrame(
+                value=table[headers],
+                type='pandas',
                 datatype=[type_map[x] for x in headers],
+                interactive=False,
                 visible=True)
             def filter_df(fields, model_size, model_type):
                 headers = check_box['essential'] + fields
                 df = cp.deepcopy(table)
                     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                     df = df[df['flag']]
                     df.pop('flag')
                 comp = gr.components.DataFrame(
+                    value=df[headers],
+                    type='pandas',
                     datatype=[type_map[x] for x in headers],
+                    interactive=False,
                     visible=True)
                 return comp
                 s.checkbox_group = gr.CheckboxGroup(
                     choices=s.check_box['all'],
                     value=s.check_box['required'],
+                    label=f'{dataset} CheckBoxes',
                     interactive=True,
                 )
                 s.headers = s.check_box['essential'] + s.checkbox_group.value
                 with gr.Row():
                     s.model_size = gr.CheckboxGroup(
+                        choices=MODEL_SIZE,
+                        value=MODEL_SIZE,
                         label='Model Size',
                         interactive=True
                     )
                     s.model_type = gr.CheckboxGroup(
+                        choices=MODEL_TYPE,
+                        value=MODEL_TYPE,
                         label='Model Type',
                         interactive=True
                     )
                 s.data_component = gr.components.DataFrame(
+                    value=s.table[s.headers],
+                    type='pandas',
                     datatype=[s.type_map[x] for x in s.headers],
+                    interactive=False,
                     visible=True)
                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
                 def filter_df_l2(dataset_name, fields, model_size, model_type):
                     s = structs[DATASETS.index(dataset_name)]
                     headers = s.check_box['essential'] + fields
                         df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                         df = df[df['flag']]
                         df.pop('flag')
                     comp = gr.components.DataFrame(
+                        value=df[headers],
+                        type='pandas',
                         datatype=[s.type_map[x] for x in headers],
+                        interactive=False,
                         visible=True)
                     return comp
                 for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+                    cbox.change(
+                        fn=filter_df_l2,
+                        inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+                        outputs=s.data_component)
     with gr.Row():
+        with gr.Accordion('Citation', open=False):
             citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id='citation-button')
 if __name__ == '__main__':
+    demo.launch(server_name='0.0.0.0')

gen_table.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import copy as cp
+import json
+from collections import defaultdict
+from urllib.request import urlopen
+import gradio as gr
+import numpy as np
+import pandas as pd
+from meta_data import META_FIELDS, URL
+def listinstr(lst, s):
+    """Return True if at least one element of ``lst`` is contained in ``s``.
+
+    ``lst`` must be a list (enforced with an assert); containment is tested
+    with the ``in`` operator, so for strings this is a substring check.
+    """
+    assert isinstance(lst, list)
+    return any(candidate in s for candidate in lst)
+def load_results():
+    """Download the JSON document at ``URL`` and return the parsed object."""
+    # Use the response as a context manager so the HTTP connection is closed
+    # as soon as the body has been read.
+    with urlopen(URL) as response:
+        return json.loads(response.read())
+def nth_large(val, vals):
+    """Return the 1-based rank of ``val`` among ``vals`` (1 = largest).
+
+    The rank is one more than the number of entries strictly greater than
+    ``val``, so equal values share the same rank.
+    """
+    rank = 1
+    for other in vals:
+        if other > val:
+            rank += 1
+    return rank
+def format_timestamp(timestamp):
+    """Reformat a 12-character digit string as ``'AA.BB.CC DD:EE:FF'``.
+
+    The input is cut into six consecutive two-character pieces; the first
+    three are joined with '.', the last three with ':'.
+    """
+    pieces = [timestamp[start:start + 2] for start in range(0, 12, 2)]
+    date = pieces[0] + '.' + pieces[1] + '.' + pieces[2]
+    time = pieces[3] + ':' + pieces[4] + ':' + pieces[5]
+    return f'{date} {time}'
+def model_size_flag(sz, FIELDS):
+    """Tell whether a model of size ``sz`` matches any selected size option.
+
+    Args:
+        sz: Model size compared against the thresholds 10 / 20 / 40 (billions
+            of parameters, going by the option labels). A missing value, as
+            judged by ``pd.isna``, can only match the 'Unknown' option.
+        FIELDS: Selected options among 'Unknown', '<10B', '10B-20B',
+            '20B-40B' and '>40B'.
+
+    Returns:
+        True if ``sz`` falls in at least one selected option, otherwise False.
+    """
+    if pd.isna(sz):
+        return 'Unknown' in FIELDS
+    # (label, membership test). Lower bounds are inclusive, so a size of
+    # exactly 40 is reported under the '>40B' label.
+    buckets = (
+        ('<10B', lambda n: n < 10),
+        ('10B-20B', lambda n: n >= 10 and n < 20),
+        ('20B-40B', lambda n: n >= 20 and n < 40),
+        ('>40B', lambda n: n >= 40),
+    )
+    for label, contains in buckets:
+        if label in FIELDS and contains(sz):
+            return True
+    return False
+def model_type_flag(line, FIELDS):
+    """Tell whether a leaderboard row matches any selected model type.
+
+    Args:
+        line: Mapping-like row exposing 'OpenSource' and 'Verified' entries
+            holding 'Yes' / 'No'.
+        FIELDS: Selected options among 'OpenSource', 'API' and 'Proprietary'.
+
+    Returns:
+        True when the row is open source and 'OpenSource' is selected, or when
+        it is not open source and its 'Verified' value matches a selected
+        closed-source option ('API' -> 'Yes', 'Proprietary' -> 'No').
+    """
+    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
+        return True
+    # Both closed-source options require OpenSource == 'No' and differ only in
+    # the expected 'Verified' value.
+    for option, verified in (('API', 'Yes'), ('Proprietary', 'No')):
+        if option in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == verified:
+            return True
+    return False
+def BUILD_L1_DF(results, fields):
+    """Build the top-level leaderboard table, one row per model.
+
+    Args:
+        results: Mapping of model key -> entry. Each entry holds a 'META'
+            mapping plus, for every dataset in ``fields``, a mapping with an
+            'Overall' score.
+        fields: Dataset names to include as score columns.
+
+    Returns:
+        A ``(df, check_box)`` tuple. ``df`` holds the ``META_FIELDS`` columns,
+        one 'Overall' column per dataset, and 'Avg Score' / 'Avg Rank', sorted
+        by ascending 'Avg Rank'. ``check_box`` maps 'essential', 'required'
+        and 'all' to column-name lists and 'type_map' to a column -> type
+        mapping that defaults to 'number'.
+    """
+    # Collect every model's 'Overall' score per dataset once, instead of
+    # rebuilding the same list for each (model, dataset) pair while ranking.
+    overall_by_dataset = {d: [x[d]['Overall'] for x in results.values()] for d in fields}
+    res = defaultdict(list)
+    for item in results.values():
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Parameters (B)':
+                # Stored as a string such as '7B'; an empty string becomes None.
+                param = meta['Parameters']
+                res[k].append(float(param.replace('B', '')) if param != '' else None)
+            elif k == 'Method':
+                # 'Method' is a (name, url) pair rendered as an HTML link.
+                name, url = meta['Method']
+                res[k].append(f'<a href="{url}">{name}</a>')
+            else:
+                res[k].append(meta[k])
+        scores, ranks = [], []
+        for d in fields:
+            overall = item[d]['Overall']
+            res[d].append(overall)
+            # MME's 'Overall' is divided by 28 before averaging -- presumably to
+            # bring it onto the scale of the other datasets; TODO confirm.
+            scores.append(overall / 28 if d == 'MME' else overall)
+            ranks.append(nth_large(overall, overall_by_dataset[d]))
+        res['Avg Score'].append(round(np.mean(scores), 1))
+        res['Avg Rank'].append(round(np.mean(ranks), 2))
+    df = pd.DataFrame(res)
+    df = df.sort_values('Avg Rank')
+    check_box = {}
+    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
+    check_box['required'] = ['Avg Score', 'Avg Rank']
+    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + fields
+    # Columns not listed explicitly are treated as numeric.
+    type_map = defaultdict(lambda: 'number')
+    type_map['Method'] = 'html'
+    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
+    check_box['type_map'] = type_map
+    return df, check_box
+def BUILD_L2_DF(results, dataset):
+    """Build the per-dataset leaderboard table, one row per model.
+
+    Args:
+        results: Mapping of model key -> entry. Each entry holds a 'META'
+            mapping and a ``dataset`` mapping of score name -> value.
+        dataset: Name of the dataset whose scores are tabulated.
+
+    Returns:
+        A ``(df, check_box)`` tuple. ``df`` holds the ``META_FIELDS`` columns
+        followed by the score columns; when an 'Overall' column exists the
+        rows are ordered from highest to lowest 'Overall'. ``check_box`` maps
+        'essential', 'required' and 'all' to column-name lists and 'type_map'
+        to a column -> type mapping that defaults to 'number'.
+    """
+    # Score names are taken from the first model's entry for this dataset.
+    first_entry = list(results.values())[0]
+    fields = list(first_entry[dataset].keys())
+    overall_fields = [field for field in fields if 'Overall' in field]
+    non_overall_fields = [field for field in fields if 'Overall' not in field]
+    if dataset == 'MME':
+        # For MME, 'Perception' and 'Cognition' are listed with the overall
+        # columns; any field whose name contains either word is dropped from
+        # the non-overall list.
+        aggregate_keys = ['Perception', 'Cognition']
+        non_overall_fields = [field for field in non_overall_fields if not listinstr(aggregate_keys, field)]
+        overall_fields = overall_fields + aggregate_keys
+
+    columns = defaultdict(list)
+    for item in results.values():
+        meta = item['META']
+        for k in META_FIELDS:
+            if k == 'Method':
+                # 'Method' is a (name, url) pair rendered as an HTML link.
+                name, url = meta['Method']
+                columns[k].append(f'<a href="{url}">{name}</a>')
+            elif k == 'Parameters (B)':
+                # Stored as a string such as '7B'; an empty string becomes None.
+                param = meta['Parameters']
+                columns[k].append(None if param == '' else float(param.replace('B', '')))
+            else:
+                columns[k].append(meta[k])
+        # Non-overall scores are appended before overall ones; this fixes the
+        # column order of the resulting DataFrame.
+        for d in non_overall_fields + overall_fields:
+            columns[d].append(item[dataset][d])
+    df = pd.DataFrame(columns)
+
+    all_fields = overall_fields + non_overall_fields
+    # Pre-select the overall columns when there are any; otherwise fall back
+    # to the first five non-overall columns.
+    required_fields = overall_fields if overall_fields else non_overall_fields[:5]
+    if 'Overall' in overall_fields:
+        # Ascending sort followed by a reversal puts the best 'Overall' first.
+        df = df.sort_values('Overall').iloc[::-1]
+
+    # Columns not listed explicitly are treated as numeric.
+    type_map = defaultdict(lambda: 'number')
+    type_map['Method'] = 'html'
+    for text_column in ('Language Model', 'Vision Model', 'OpenSource', 'Verified'):
+        type_map[text_column] = 'str'
+    check_box = {
+        'essential': ['Method', 'Parameters (B)', 'Language Model', 'Vision Model'],
+        'required': required_fields,
+        'all': all_fields,
+        'type_map': type_map,
+    }
+    return df, check_box

lb_info.py → meta_data.py RENAMED Viewed

@@ -1,17 +1,3 @@
-import json
-import pandas as pd
-from collections import defaultdict
-import gradio as gr
-import copy as cp
-import numpy as np
-def listinstr(lst, s):
-    assert isinstance(lst, list)
-    for item in lst:
-        if item in s:
-            return True
-    return False
 # CONSTANTS-URL
 URL = "http://opencompass.openxlab.space/utils/OpenVLM.json"
 VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
@@ -138,125 +124,4 @@ LEADERBOARD_MD['ScienceQA_VAL'] = """
 - During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
 """
-LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']
-from urllib.request import urlopen
-def load_results():
-    data = json.loads(urlopen(URL).read())
-    return data
-def nth_large(val, vals):
-    return sum([1 for v in vals if v > val]) + 1
-def format_timestamp(timestamp):
-    return timestamp[:2] + '.' + timestamp[2:4] + '.' + timestamp[4:6] + ' ' + timestamp[6:8] + ':' + timestamp[8:10] + ':' + timestamp[10:12]
-def model_size_flag(sz, FIELDS):
-    if pd.isna(sz) and 'Unknown' in FIELDS:
-        return True
-    if pd.isna(sz):
-        return False
-    if '<10B' in FIELDS and sz < 10:
-        return True
-    if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
-        return True
-    if '20B-40B' in FIELDS and sz >= 20 and sz < 40:
-        return True
-    if '>40B' in FIELDS and sz >= 40:
-        return True
-    return False
-def model_type_flag(line, FIELDS):
-    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
-        return True
-    if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
-        return True
-    if 'Proprietary' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'No':
-        return True
-    return False
-def BUILD_L1_DF(results, fields):
-    res = defaultdict(list)
-    for i, m in enumerate(results):
-        item = results[m]
-        meta = item['META']
-        for k in META_FIELDS:
-            if k == 'Parameters (B)':
-                param = meta['Parameters']
-                res[k].append(float(param.replace('B', '')) if param != '' else None)
-            elif k == 'Method':
-                name, url = meta['Method']
-                res[k].append(f'<a href="{url}">{name}</a>')
-            else:
-                res[k].append(meta[k])
-        scores, ranks = [], []
-        for d in fields:
-            res[d].append(item[d]['Overall'])
-            if d == 'MME':
-                scores.append(item[d]['Overall'] / 28)
-            else:
-                scores.append(item[d]['Overall'])
-            ranks.append(nth_large(item[d]['Overall'], [x[d]['Overall'] for x in results.values()]))
-        res['Avg Score'].append(round(np.mean(scores), 1))
-        res['Avg Rank'].append(round(np.mean(ranks), 2))
-    df = pd.DataFrame(res)
-    df = df.sort_values('Avg Rank')
-    check_box = {}
-    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
-    check_box['required'] = ['Avg Score', 'Avg Rank']
-    check_box['all'] = check_box['required'] + ['OpenSource', 'Verified'] + fields
-    type_map = defaultdict(lambda: 'number')
-    type_map['Method'] = 'html'
-    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
-    check_box['type_map'] = type_map
-    return df, check_box
-def BUILD_L2_DF(results, dataset):
-    res = defaultdict(list)
-    fields = list(list(results.values())[0][dataset].keys())
-    non_overall_fields = [x for x in fields if 'Overall' not in x]
-    overall_fields = [x for x in fields if 'Overall' in x]
-    if dataset == 'MME':
-        non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
-        overall_fields = overall_fields + ['Perception', 'Cognition']
-    for m in results:
-        item = results[m]
-        meta = item['META']
-        for k in META_FIELDS:
-            if k == 'Parameters (B)':
-                param = meta['Parameters']
-                res[k].append(float(param.replace('B', '')) if param != '' else None)
-            elif k == 'Method':
-                name, url = meta['Method']
-                res[k].append(f'<a href="{url}">{name}</a>')
-            else:
-                res[k].append(meta[k])
-        fields = [x for x in fields]
-        for d in non_overall_fields:
-            res[d].append(item[dataset][d])
-        for d in overall_fields:
-            res[d].append(item[dataset][d])
-    df = pd.DataFrame(res)
-    all_fields = overall_fields + non_overall_fields
-    # Use the first 5 non-overall fields as required fields
-    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
-    if 'Overall' in overall_fields:
-        df = df.sort_values('Overall')
-        df = df.iloc[::-1]
-    check_box = {}
-    check_box['essential'] = ['Method', 'Parameters (B)', 'Language Model', 'Vision Model']
-    check_box['required'] = required_fields
-    check_box['all'] = all_fields
-    type_map = defaultdict(lambda: 'number')
-    type_map['Method'] = 'html'
-    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
-    check_box['type_map'] = type_map
-    return df, check_box

 # CONSTANTS-URL
 URL = "http://opencompass.openxlab.space/utils/OpenVLM.json"
 VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
 - During evaluation, we use `GPT-3.5-Turbo-0613` as the choice extractor for all VLMs if the choice can not be extracted via heuristic matching. **Zero-shot** inference is adopted.
 """
+LEADERBOARD_MD['ScienceQA_TEST'] = LEADERBOARD_MD['ScienceQA_VAL']

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 numpy>=1.23.4
 pandas>=1.5.3
-gradio==4.15.0

+gradio==4.15.0
 numpy>=1.23.4
 pandas>=1.5.3