from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the JSON file, metric_key in the JSON file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")
NUM_FEWSHOT = 0  # Change to match your few-shot setting
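# A hypothetical sketch (not part of the template) of how the Tasks enum above
# might be consumed: each Task names the benchmark key and metric key expected
# in a results dict, plus the column title to display on the leaderboard.
def build_leaderboard_row(results: dict) -> dict:
    # `results` is assumed to be shaped like {benchmark_key: {metric_key: value}},
    # as the comment on the Tasks enum describes.
    row = {}
    for task in Tasks:
        t = task.value
        row[t.col_name] = results.get(t.benchmark, {}).get(t.metric)
    return row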
# ---------------------------------------------------
# Your leaderboard name
TITLE = """
TempCompass leaderboard
"""
# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
Welcome to the TempCompass leaderboard!
TempCompass is a benchmark to evaluate the temporal perception ability of Video LLMs. It consists of 410 videos and 7,540 task instructions, covering 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for more details.
"""
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works
TempCompass evaluates the temporal perception ability of Video LLMs across 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for details.
## Reproducibility
To reproduce our results, run inference and automatic evaluation following the instructions in our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
"""
EVALUATION_QUEUE_TEXT = """
# TempCompass Leaderboard
Welcome to the TempCompass leaderboard!
## Submission Instructions
Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
You will obtain a JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four task categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
For `multi-choice`, `yes_no` and `caption_matching`, the evaluation result of each question contains five keys; `match_success` indicates whether the Video LLM's prediction can be assessed by rule-based matching. A specific example is as follows:
```json
{
    "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
    "gt-answer": "D. fighting",
    "video-llm-prediction": "D",
    "match_success": true,
    "rating": 1
}
```
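As an unofficial sketch (the evaluation scripts in the repository are authoritative), the per-question `rating` values in such a file could be aggregated into an accuracy-style score, assuming each rating is 0 or 1:
```python
import json

# Unofficial sketch: average the 0/1 "rating" over all questions in one
# auto-eval result file. Assumes ratings sit somewhere inside nested
# dicts/lists; adapt the traversal if your file groups questions differently.
def mean_rating(path: str) -> float:
    with open(path) as f:
        results = json.load(f)
    ratings = []

    def collect(node):
        if isinstance(node, dict):
            if "rating" in node:
                ratings.append(node["rating"])
            else:
                for value in node.values():
                    collect(value)
        elif isinstance(node, list):
            for value in node:
                collect(value)

    collect(results)
    return sum(ratings) / max(len(ratings), 1)  # guard against empty files
```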
For `captioning`, we prompt ChatGPT to answer the multi-choice question, using the caption generated by the Video LLM as context. An example of the evaluation result is as follows:
```json
{
    "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
    "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
    "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
    "gt-answer": "A. dunking a basketball",
    "rating": 0
}
```
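Before uploading, you can sanity-check that every entry in your `<task_type>.json` carries the expected keys. A minimal, hypothetical checker (not part of the official tooling) might look like:
```python
import json
import sys

# Keys shared by all four task types, per the examples above.
REQUIRED_KEYS = {"video-llm-prediction", "gt-answer", "rating"}

# Hypothetical pre-submission check: walk the nested result file and report
# any question entry that lacks one of the required keys.
def check_submission(path: str) -> bool:
    with open(path) as f:
        data = json.load(f)
    ok = True

    def walk(node):
        nonlocal ok
        if isinstance(node, dict):
            if "rating" in node:
                missing = REQUIRED_KEYS - node.keys()
                if missing:
                    ok = False
                    print("missing keys:", missing, file=sys.stderr)
            else:
                for value in node.values():
                    walk(value)
        elif isinstance(node, list):
            for value in node:
                walk(value)

    walk(data)
    return ok
```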
### Submit Example
For example, if you want to submit Video-LLaVA's results to the leaderboard, you need to:
1. Fill in “Video-LLaVA” in “Model Name” if this is the first time you submit your results (you can leave “Revision Model Name” blank).
2. Fill in “Video-LLaVA” in “Revision Model Name” if you want to update your results (you can leave “Model Name” blank).
3. Select “ImageLLM” in “Model Type”.
4. Fill in “https://github.com/x/x” in “Model Link”.
5. Fill in “7B” in “Model size”.
6. Upload `<task_type>.json`.
7. Click the “Submit Eval” button.
8. Click “Refresh” to see your submission on the leaderboard.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{liu2024tempcompass,
  title   = {TempCompass: Do Video LLMs Really Understand Videos?},
  author  = {Yuanxin Liu and Shicheng Li and Yi Liu and Yuxiang Wang and Shuhuai Ren and Lei Li and Sishuo Chen and Xu Sun and Lu Hou},
  year    = {2024},
  journal = {arXiv preprint arXiv:2403.00476}
}
"""