File size: 2,993 Bytes
8460af1
b0ee7b4
 
 
aa9bb5e
6e6ac6c
 
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
 
6e6ac6c
aa9bb5e
b0ee7b4
 
 
cb50a24
7337644
aae65ae
f2e3361
b0ee7b4
 
 
20f0a61
 
 
 
 
6d0114c
 
e9c1ec6
 
 
 
6d0114c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0ee7b4
f2e3361
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
import gradio as gr

# CAMEL-Bench raw results: one score list per task category, index-aligned
# with the model names in "Method".
scores = {
    "Method": [
        "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash", 
        "LLaVa-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVa-NeXt-7B"
    ],
    "MM Understanding & Reasoning": [
        57.90, 48.82, 51.35, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33
    ],
    "OCR & Document Understanding": [
        59.11, 42.89, 49.06, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12
    ],
    "Charts & Diagram Understanding": [
        73.57, 64.98, 55.39, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56
    ],
    "Video Understanding": [
        74.27, 68.11, 62.64, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90
    ],
    "Cultural Specific Understanding": [
        80.86, 65.92, 75.64, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30
    ],
    "Medical Imaging": [
        49.90, 47.37, 39.42, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54
    ],
    "Agro Specific": [
        80.75, 79.58, 79.84, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00
    ],
    "Remote Sensing Understanding": [
        22.85, 16.93, 22.28, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33
    ]
}

# Leaderboard table: per-model mean over all task categories, rounded to
# two decimals and placed directly after "Method" so it leads the display.
df = pd.DataFrame(scores)
category_cols = df.columns[1:]
df.insert(1, "Average Score", df[category_cols].mean(axis=1).round(2))

def display_data():
    """Return the leaderboard DataFrame (Gradio display callback)."""
    return df

# Page copy, kept verbatim so the rendered leaderboard page is unchanged.
_ICON_MD = "![camel icon](https://cdn-uploads.huggingface.co/production/uploads/656864e12d73834278a8dea7/n-XfVKd1xVywH_vgPyJyQ.png)"
_TITLE_MD = "# **CAMEL-Bench: Model Performance Across Vision Understanding Tasks**"
_INTRO_MD = """
    This table shows the performance of different models across various tasks including OCR, chart understanding, video, medical imaging, and more. 
    """
_SUBMIT_MD = """
            ## Submission Instructions

            To contribute your model's results to the CAMEL-Bench leaderboard:

            - **Via GitHub Pull Request**: 
              - Use [this evaluation script](https://github.com/mbzuai-oryx/Camel-Bench/blob/main/scripts/eval_qwen.py) to test your model and generate results.
              - Create a pull request in the CAMEL-Bench GitHub repository with your results.

            - **Via Email**:
              - Send your results to **[email protected]**, and we’ll add them to the leaderboard for you.

            **We look forward to seeing your contributions!**
            """

# Two-tab layout: the leaderboard table itself, and submission instructions.
with gr.Blocks() as demo:
    gr.Markdown(_ICON_MD, elem_id="camel-icon")
    gr.Markdown(_TITLE_MD)
    gr.Markdown(_INTRO_MD)

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            # Read-only view of the precomputed leaderboard DataFrame.
            gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)

        with gr.TabItem("📤 How to Submit", elem_id="submission-tab", id=1):
            gr.Markdown(_SUBMIT_MD)

demo.launch()