Spaces:

lyx97
/

TempCompass

Running

App Files Files Community

lyx97 commited on Jul 6

Commit

a07085a

•

1 Parent(s): ba19895

commit files to HF hub

Browse files

Files changed (5) hide show

constants.py.bak +98 -0
src/about.py +36 -20
src/submission/check_validity.py +7 -0
src/submission/submit.py +12 -42
src/utils_display.py +99 -0

constants.py.bak ADDED Viewed

	@@ -0,0 +1,98 @@

+# This module stores the constants used by the leaderboard.
+# Header of the single column that identifies the model.
+MODEL_INFO = ["Model"]
+# Headers of the 25 score columns: the five "Avg." columns first, then four
+# columns (Multi-Choice, Yes/No, Caption Matching, Caption Generation) for each
+# of Action, Direction, Speed, Event Order and Attribute Change.
+TASK_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation",
+                "Action. Multi-Choice", "Action. Yes/No", "Action. Caption Matching", "Action. Caption Generation",
+                "Direction. Multi-Choice", "Direction. Yes/No", "Direction. Caption Matching", "Direction. Caption Generation",
+                "Speed. Multi-Choice", "Speed. Yes/No", "Speed. Caption Matching", "Speed. Caption Generation",
+                "Event Order. Multi-Choice", "Event Order. Yes/No", "Event Order. Caption Matching", "Event Order. Caption Generation",
+                "Attribute Change. Multi-Choice", "Attribute Change. Yes/No", "Attribute Change. Caption Matching", "Attribute Change. Caption Generation"]
+# The five averaged columns; identical to the first five entries of TASK_INFO.
+AVG_INFO = ["Avg. All", "Avg. Multi-Choice", "Avg. Yes/No", "Avg. Caption Matching", "Avg. Caption Generation"]
+# 26 type strings: "markdown" followed by 25 x "number", i.e. the same length as
+# COLUMN_NAMES (1 model column + 25 score columns).
+# NOTE(review): presumably paired position-by-position with COLUMN_NAMES by the
+# table widget; keep both lists in sync when adding columns — confirm at the call site.
+DATA_TITILE_TYPE = ["markdown",
+                    "number", "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",
+                    "number", "number", "number", "number",]
+# Relative path of the results CSV (the code that reads it is not in this file).
+CSV_DIR = "./file/result.csv"
+# Full table header: the model column followed by every score column.
+COLUMN_NAMES = MODEL_INFO + TASK_INFO
+LEADERBORAD_INTRODUCTION = """
+Welcome to the leaderboard of TempCompass! 🏆
+TempCompass is a benchmark to evaluate the temporal perception ability of Video LLMs. It consists of 410 videos and 7,540 task instructions, covering 11 temporal aspects and 4 task types. Please refer to [our paper](https://arxiv.org/abs/2403.00476) for more details.
+"""
+SUBMIT_INTRODUCTION = """
+# TempCompass Leaderboard
+Welcome to the leaderboard of TempCompass! 🏆
+## Submit Instruction
+Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
+You will obtain the JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
+For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
+```python
+{
+    "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
+    "gt-answer": "D. fighting",
+    "video-llm-prediction": "D",
+    "match_success": true,  # whether the video-llm-prediction can be assessed by rule-based matching
+    "rating": 1
+}
+```
+For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evaluation result is as follows:
+```python
+{
+    "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
+    "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
+    "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
+    "gt-answer": "A. dunking a basketball",
+    "rating": 0
+}
+```
+### Submit Example
+For example, if you want to submit Video-LLaVA's result in the leaderboard, you need to:
+1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if it is your first time to submit your result (You can leave ‘Revision Model Name’ blank).
+2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update your result (You can leave ‘Model Name’ blank).
+3. Select ‘ImageLLM’ in ‘Model Type’.
+4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
+5. Fill in ‘7B’ in ‘Model size’.
+6. Upload `<task_type>.json`.
+7. Click the ‘Submit Eval’ button.
+8. Click ‘Refresh’ to obtain the uploaded leaderboard.
+"""
+TABLE_INTRODUCTION = """In the table below, we summarize each task performance of all the models.
+        We use accuracy (%) as the primary evaluation metric for each task.
+    """
+# NOTE(review): the text below describes SEED-Bench (19K multiple-choice
+# questions, 12 evaluation dimensions), not TempCompass. It looks like it was
+# carried over from another leaderboard — confirm whether it is still displayed
+# anywhere and replace it with a TempCompass description if so.
+LEADERBORAD_INFO = """
+      Based on powerful Large Language Models (LLMs), recent generative Multimodal Large Language Models (MLLMs) have gained prominence as a pivotal research area, exhibiting remarkable capability for both comprehension and generation.
+      In this work, we address the evaluation of generative comprehension in MLLMs as a preliminary step towards a comprehensive assessment of generative models, by introducing a benchmark named SEED-Bench.
+      SEED-Bench consists of 19K multiple choice questions with accurate human annotations (x6 larger than existing benchmarks), which spans 12 evaluation dimensions including the comprehension of both the image and video modality.
+      We develop an advanced pipeline for generating multiple-choice questions that target specific evaluation dimensions, integrating both automatic filtering and manual verification processes.
+      Multiple-choice questions with groundtruth options derived from human annotation enables an objective and efficient assessment of model performance, eliminating the need for human or GPT intervention during evaluation.
+      We further evaluate the performance of 18 models across all 12 dimensions, covering both the spatial and temporal understanding.
+      By revealing the limitations of existing MLLMs through evaluation results, we aim for SEED-Bench to provide insights for motivating future research.
+"""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+@article{liu2024tempcompass,
+  title   = {TempCompass: Do Video LLMs Really Understand Videos?},
+  author  = {Yuanxin Liu and Shicheng Li and Yi Liu and Yuxiang Wang and Shuhuai Ren and Lei Li and Sishuo Chen and Xu Sun and Lu Hou},
+  year    = {2024},
+  journal = {arXiv preprint arXiv: 2403.00476}
+}
+"""

src/about.py CHANGED Viewed

@@ -40,33 +40,49 @@ To reproduce our results, here is the commands you can run:
 """
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

 """
 EVALUATION_QUEUE_TEXT = """
+# TempCompass Leaderboard
+Welcome to the leaderboard of TempCompass! 🏆
+## Submit Instruction
+Run inference and automatic evaluation according to our [github repository](https://github.com/llyx97/TempCompass?tab=readme-ov-file#-quick-start).
+You will obtain the JSON file `<task_type>.json`, where `<task_type>` corresponds to one of the four categories: `multi-choice`, `yes_no`, `caption_matching` and `captioning`. (Example files can be found [here](https://github.com/llyx97/TempCompass/tree/main/auto_eval_results/video-llava))
+For `multi-choice`, `yes_no`, `caption_matching`, the evaluation result of each question contains five keys. A specific example is as follows:
 ```python
+{
+    "question": "What activity is the monkey engaged in?\\nA. swimming\\nB. running\\nC. climbing\\nD. fighting",
+    "gt-answer": "D. fighting",
+    "video-llm-prediction": "D",
+    "match_success": true,  # whether the video-llm-prediction can be assessed by rule-based matching
+    "rating": 1
+}
 ```
+For `captioning`, we prompt chatgpt to answer the multi-choice question, using the Video LLM generated caption as context. An example of evaluation result is as follows:
+```python
+{
+    "chatgpt-reasoning": "The video description specifically mentions that the man is dribbling a basketball, dunking a basketball, and passing a basketball.",
+    "chatgpt-answer": "B. dribbling a basketball, C. passing a basketball",
+    "video-llm-prediction": "The video showcases a man dribbling a basketball, dunking a basketball, and passing a basketball. The man is seen moving around the court while performing these actions. The video captures the man's movements and the sound of the ball bouncing on the court. The man's dribbling skills are impressive, and he seems to be in control of the ball at all times. The dunking and passing actions are also executed with precision, and the man's movements are fluid and graceful. Overall, the video is a great display of basketball skills and is sure to impress any basketball",
+    "gt-answer": "A. dunking a basketball",
+    "rating": 0
+}
+```
+### Submit Example
+For example, if you want to submit Video-LLaVA's result in the leaderboard, you need to:
+1. Fill in ‘Video-LLaVA’ in ‘Model Name’ if it is your first time to submit your result (You can leave ‘Revision Model Name’ blank).
+2. Fill in ‘Video-LLaVA’ in ‘Revision Model Name’ if you want to update your result (You can leave ‘Model Name’ blank).
+3. Select ‘ImageLLM’ in ‘Model Type’.
+4. Fill in ‘https://github.com/x/x’ in ‘Model Link’.
+5. Fill in ‘7B’ in ‘Model size’.
+6. Upload `<task_type>.json`.
+7. Click the ‘Submit Eval’ button.
+8. Click ‘Refresh’ to obtain the uploaded leaderboard.
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

src/submission/check_validity.py CHANGED Viewed

@@ -97,3 +97,10 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                     users_to_submission_dates[organisation].append(info["submitted_time"])
     return set(file_names), users_to_submission_dates

                     users_to_submission_dates[organisation].append(info["submitted_time"])
     return set(file_names), users_to_submission_dates
+def validate_model_size(s):
+    """Return ``s`` if it is a well-formed model size, otherwise ``'-'``.
+
+    A well-formed size is one or more digits followed by an upper-case ``B``
+    (for example ``'7B'``), or the placeholder ``'-'`` itself.
+
+    Args:
+        s: The model size string to check.
+
+    Returns:
+        ``s`` unchanged when it is well-formed, else the placeholder ``'-'``.
+    """
+    # Anything that is not a string cannot match; re would raise TypeError on it.
+    if not isinstance(s, str):
+        return '-'
+    # fullmatch must consume the whole string. `re.match` with a trailing `$`
+    # also accepts "7B\n", which would then be stored with the newline.
+    if re.fullmatch(r'\d+B|-', s):
+        return s
+    return '-'

src/submission/submit.py CHANGED Viewed

@@ -6,21 +6,19 @@ from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
 )
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 def add_new_eval(
     model: str,
-    base_model: str,
     revision: str,
-    precision: str,
-    weight_type: str,
     model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -33,7 +31,6 @@ def add_new_eval(
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
-    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
     if model_type is None or model_type == "":
@@ -43,61 +40,34 @@ def add_new_eval(
     if revision == "":
         revision = "main"
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
     # Seems good, creating the eval
     print("Adding new eval")
     eval_entry = {
-        "model": model,
-        "base_model": base_model,
         "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
-        "likes": model_info.likes,
         "params": model_size,
-        "license": license,
         "private": False,
     }
     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))

 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
 from src.submission.check_validity import (
     already_submitted_models,
+    validate_model_size,
 )
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 def add_new_eval(
     model: str,
     revision: str,
+    model_size: str,
     model_type: str,
+    model_link: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
     if model_type is None or model_type == "":
     if revision == "":
         revision = "main"
+    model_size = validate_model_size(model_size)
+    if model_link == '':
+        model_name = model  # no url
+    else:
+        model_name = '[' + model + '](' + model_link + ')'
     # Seems good, creating the eval
     print("Adding new eval")
     eval_entry = {
+        "model_name": model_name,
         "revision": revision,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
         "params": model_size,
         "private": False,
     }
     # Check for duplicate submission
+    if f"{model}_{revision}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False.json"
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))

src/utils_display.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from dataclasses import dataclass
+# These classes hold the user-facing column names in one place, so that a
+# rename only has to be made here rather than all around the code.
+@dataclass
+class ColumnContent:
+    """Descriptor for one table column."""
+    # Column header text.
+    name: str
+    # Column type string; values used in this file are "str", "markdown",
+    # "number" and "bool".
+    type: str
+    # NOTE(review): presumably whether the column is shown before the user
+    # changes the column selection — confirm against the UI code.
+    displayed_by_default: bool
+    # NOTE(review): presumably marks columns that are never offered to the
+    # user — confirm against the UI code.
+    hidden: bool = False
+def fields(raw_class):
+    """Collect the values of ``raw_class``'s own non-dunder attributes.
+
+    An attribute is skipped when its name starts with ``__`` or ends with
+    ``__``; every other entry of ``raw_class.__dict__`` contributes its value,
+    in definition order.
+    """
+    collected = []
+    for attr_name, attr_value in raw_class.__dict__.items():
+        if attr_name.startswith("__") or attr_name.endswith("__"):
+            continue
+        collected.append(attr_value)
+    return collected
+@dataclass(frozen=True)
+class AutoEvalColumn: # Auto evals column
+    """Column descriptors for the automatic-evaluation table.
+
+    None of the attributes below is annotated, so ``@dataclass`` does not turn
+    them into dataclass fields: they are ordinary class attributes, each bound
+    to a ``ColumnContent``.
+    """
+    model_type_symbol = ColumnContent("T", "str", True)
+    model = ColumnContent("Model", "markdown", True)
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC", "number", True)
+    hellaswag = ColumnContent("HellaSwag", "number", True)
+    mmlu = ColumnContent("MMLU", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    model_type = ColumnContent("Type", "str", False)
+    # The only column in this class created with hidden=True.
+    precision = ColumnContent("Precision", "str", False, True)
+    license = ColumnContent("Hub License", "str", False)
+    params = ColumnContent("#Params (B)", "number", False)
+    likes = ColumnContent("Hub ❤️", "number", False)
+    revision = ColumnContent("Model sha", "str", False, False)
+    dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
+@dataclass(frozen=True)
+class EloEvalColumn: # Elo evals column
+    """Column descriptors for the Elo-rating table.
+
+    The attributes are unannotated class attributes (not dataclass fields),
+    each bound to a ``ColumnContent``; all are displayed by default.
+    """
+    model = ColumnContent("Model", "markdown", True)
+    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
+    human_all = ColumnContent("Human (all)", "number", True)
+    human_instruct = ColumnContent("Human (instruct)", "number", True)
+    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
+@dataclass(frozen=True)
+class EvalQueueColumn: # Queue column
+    """Column descriptors for the evaluation-queue table.
+
+    The attributes are unannotated class attributes (not dataclass fields),
+    each bound to a ``ColumnContent``; all are displayed by default.
+    """
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    # Precision is a text label, so the column type is "str" rather than "bool".
+    precision = ColumnContent("precision", "str", True)
+    # The third argument is the boolean displayed_by_default flag; the string
+    # "Original" that used to be passed here was only truthy by accident.
+    weight_type = ColumnContent("weight_type", "str", True)
+    status = ColumnContent("status", "str", True)
+LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
+# This URL used to be assigned to VICUNA_LINK a second time, which silently
+# discarded the lmsys URL above and made "vicuna-13b" link to stable-vicuna.
+STABLE_VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
+ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
+# Short display names whose link is not the default https://huggingface.co/<name>.
+_SHORT_NAME_LINKS = {
+    "dolly-12b": DOLLY_LINK,
+    "vicuna-13b": VICUNA_LINK,
+    "koala-13b": KOALA_LINK,
+    "oasst-12b": OASST_LINK,
+}
+def model_hyperlink(link, model_name):
+    """Return an HTML anchor that opens ``link`` in a new tab, labelled ``model_name``."""
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+def make_clickable_model(model_name):
+    """Return an HTML link for ``model_name``.
+
+    The link defaults to ``https://huggingface.co/<model_name>``. A few known
+    names are special-cased: the LLaMA checkpoints, stable-vicuna and alpaca
+    get both a dedicated link and a shortened display name, and four short
+    names (dolly, vicuna, koala, oasst) get a dedicated link only.
+    """
+    link = f"https://huggingface.co/{model_name}"
+    if model_name in LLAMAS:
+        link = LLAMA_LINK
+        model_name = model_name.split("/")[1]
+    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
+        link = STABLE_VICUNA_LINK
+        model_name = "stable-vicuna-13b"
+    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
+        link = ALPACA_LINK
+        model_name = "alpaca-13b"
+    # Looked up after the renames above, exactly like the second if-chain it
+    # replaces; none of the renamed display names is a key of the table.
+    link = _SHORT_NAME_LINKS.get(model_name, link)
+    return model_hyperlink(link, model_name)
+def styled_error(error):
+    """Wrap ``error`` in a centred, red, 20px HTML paragraph."""
+    html = f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+    return html
+def styled_warning(warn):
+    """Wrap ``warn`` in a centred, orange, 20px HTML paragraph."""
+    html = f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+    return html
+def styled_message(message):
+    """Wrap ``message`` in a centred, green, 20px HTML paragraph."""
+    html = f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+    return html