import pandas as pd
import gradio as gr
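# CAMEL-Bench leaderboard demo: per-task scores (in %) for several large
# multimodal models, rendered as a table in a Gradio Blocks app.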
data = {
    "Method": [
        "GPT-4o", "GPT-4o-mini", "Qwen2-VL-7B", "Gemini-1.5-Pro", "Gemini-1.5-Flash",
        "LLaVA-OneVision-7B", "Pangea-7B-Instruct", "Qwen2-VL-2B", "InternVL2-8B", "LLaVA-NeXT-7B"
    ],
    "MM Understanding & Reasoning": [
        57.90, 48.82, 51.35, 46.67, 45.58, 42.90, 40.09, 40.59, 30.41, 26.33
    ],
    "OCR & Document Understanding": [
        59.11, 42.89, 49.06, 36.59, 33.59, 31.35, 17.75, 25.68, 15.91, 19.12
    ],
    "Charts & Diagram Understanding": [
        73.57, 64.98, 55.39, 47.06, 48.25, 40.86, 38.75, 27.83, 30.27, 27.56
    ],
    "Video Understanding": [
        74.27, 68.11, 62.64, 42.94, 53.31, 29.41, 49.01, 38.90, 51.42, 44.90
    ],
    "Cultural Specific Understanding": [
        80.86, 65.92, 75.64, 56.24, 46.54, 66.02, 20.34, 34.27, 20.88, 28.30
    ],
    "Medical Imaging": [
        49.90, 47.37, 39.42, 33.77, 42.86, 27.29, 31.99, 29.12, 29.48, 22.54
    ],
    "Agro Specific": [
        80.75, 79.58, 79.84, 72.12, 76.06, 75.03, 74.51, 52.02, 44.47, 42.00
    ],
    "Remote Sensing Understanding": [
        22.85, 16.93, 22.28, 17.07, 14.95, 10.72, 6.67, 12.56, 5.36, 8.33
    ]
}
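# Assemble the DataFrame, derive an Average Score as the mean of all task
# columns (rounded to 2 decimals), and place it right after the Method column.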
df = pd.DataFrame(data)
df['Average Score'] = df.iloc[:, 1:].mean(axis=1).round(2)
df = df[['Method', 'Average Score'] + [col for col in df.columns if col not in ['Method', 'Average Score']]]
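# Note: this helper is defined but never wired to a component; the
# gr.Dataframe below reads `df` directly.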
def display_data():
    return df
with gr.Blocks() as demo:
    gr.Markdown(
        "![camel icon](https://cdn-uploads.huggingface.co/production/uploads/656864e12d73834278a8dea7/n-XfVKd1xVywH_vgPyJyQ.png)",
        elem_id="camel-icon",
    )  # Replace with actual camel icon URL
    gr.Markdown("# **CAMEL-Bench: Model Performance Across Vision Understanding Tasks**")
    gr.Markdown(
        "This table shows the performance of different models across tasks such as "
        "OCR, chart and diagram understanding, video, medical imaging, and more."
    )
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LMM Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            gr.Dataframe(value=df, label="CAMEL-Bench Model Performance", interactive=False)
        with gr.TabItem("📤 How to Submit", elem_id="submission-tab", id=1):
            gr.Markdown("""
## Submission Instructions

To contribute your model's results to the CAMEL-Bench leaderboard:

- **Via GitHub pull request**:
  - Use [this evaluation script](https://github.com/mbzuai-oryx/Camel-Bench/blob/main/scripts/eval_qwen.py) to evaluate your model and generate results.
  - Open a pull request in the CAMEL-Bench GitHub repository with your results.
- **Via email**:
  - Send your results to **[email protected]** and we'll add them to the leaderboard for you.

**We look forward to seeing your contributions!**
""")

demo.launch()