Madhavan Iyengar committed on
Commit
94f782b
•
1 Parent(s): b894895

update about and submit pages

Files changed (2)
  1. app.py +54 -78
  2. src/about.py +63 -31
app.py CHANGED
@@ -29,7 +29,10 @@ from src.display.utils import (
29
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
  from src.submission.submit import add_new_eval
32
- pd.set_option('display.float_format', '{:.2f}'.format)
33
 
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
@@ -155,7 +158,7 @@ with demo:
155
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
156
 
157
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
158
- with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
159
  with gr.Row():
160
  with gr.Column():
161
  with gr.Row():
@@ -239,87 +242,60 @@ with demo:
239
  with gr.Row():
240
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
241
 
242
- with gr.Column():
243
- with gr.Accordion(
244
- f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
245
- open=False,
246
- ):
247
- with gr.Row():
248
- finished_eval_table = gr.components.Dataframe(
249
- value=finished_eval_queue_df,
250
- headers=EVAL_COLS,
251
- datatype=EVAL_TYPES,
252
- row_count=5,
253
- )
254
- with gr.Accordion(
255
- f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
256
- open=False,
257
- ):
258
- with gr.Row():
259
- running_eval_table = gr.components.Dataframe(
260
- value=running_eval_queue_df,
261
- headers=EVAL_COLS,
262
- datatype=EVAL_TYPES,
263
- row_count=5,
264
- )
265
-
266
- with gr.Accordion(
267
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
268
- open=False,
269
- ):
270
- with gr.Row():
271
- pending_eval_table = gr.components.Dataframe(
272
- value=pending_eval_queue_df,
273
- headers=EVAL_COLS,
274
- datatype=EVAL_TYPES,
275
- row_count=5,
276
- )
277
  with gr.Row():
278
- gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
279
 
280
  with gr.Row():
281
- with gr.Column():
282
  model_name_textbox = gr.Textbox(label="Model name")
283
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
284
- model_type = gr.Dropdown(
285
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
286
- label="Model type",
287
- multiselect=False,
288
- value=None,
289
- interactive=True,
290
- )
291
-
292
- with gr.Column():
293
- precision = gr.Dropdown(
294
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
295
- label="Precision",
296
- multiselect=False,
297
- value="float16",
298
- interactive=True,
299
- )
300
- weight_type = gr.Dropdown(
301
- choices=[i.value.name for i in WeightType],
302
- label="Weights type",
303
- multiselect=False,
304
- value="Original",
305
- interactive=True,
306
  )
307
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
308
-
309
- submit_button = gr.Button("Submit Eval")
310
- submission_result = gr.Markdown()
311
- submit_button.click(
312
- add_new_eval,
313
- [
314
- model_name_textbox,
315
- base_model_name_textbox,
316
- revision_name_textbox,
317
- precision,
318
- weight_type,
319
- model_type,
320
- ],
321
- submission_result,
322
- )
323
 
324
  with gr.Row():
325
  with gr.Accordion("πŸ“™ Citation", open=False):
 
29
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
  from src.submission.submit import add_new_eval
32
+
33
+ def handle_new_eval_submission(model_name, model_zip, model_link):
34
+ # This is a placeholder for the actual submission logic
35
+ return "We are not accepting submissions at this time, please check back soon!"
36
 
37
  def restart_space():
38
  API.restart_space(repo_id=REPO_ID)
 
158
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
159
 
160
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
161
+ with gr.TabItem("πŸ… 3D-POPE Benchmark", elem_id="llm-benchmark-tab-table", id=0):
162
  with gr.Row():
163
  with gr.Column():
164
  with gr.Row():
 
242
  with gr.Row():
243
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
244
 
245
+ # with gr.Column():
246
+ # with gr.Accordion(
247
+ # f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
248
+ # open=False,
249
+ # ):
250
+ # with gr.Row():
251
+ # finished_eval_table = gr.components.Dataframe(
252
+ # value=finished_eval_queue_df,
253
+ # headers=EVAL_COLS,
254
+ # datatype=EVAL_TYPES,
255
+ # row_count=5,
256
+ # )
257
+ # with gr.Accordion(
258
+ # f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
259
+ # open=False,
260
+ # ):
261
+ # with gr.Row():
262
+ # running_eval_table = gr.components.Dataframe(
263
+ # value=running_eval_queue_df,
264
+ # headers=EVAL_COLS,
265
+ # datatype=EVAL_TYPES,
266
+ # row_count=5,
267
+ # )
268
+
269
+ # with gr.Accordion(
270
+ # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
271
+ # open=False,
272
+ # ):
273
+ # with gr.Row():
274
+ # pending_eval_table = gr.components.Dataframe(
275
+ # value=pending_eval_queue_df,
276
+ # headers=EVAL_COLS,
277
+ # datatype=EVAL_TYPES,
278
+ # row_count=5,
279
+ # )
280
  with gr.Row():
281
+ gr.Markdown("# πŸ“‹ Submit your results here!", elem_classes="markdown-text")
282
 
283
  with gr.Row():
 
284
  model_name_textbox = gr.Textbox(label="Model name")
285
+ model_zip_file = gr.File(label="Upload model ZIP file")
286
+ model_link_textbox = gr.Textbox(label="Model link")
287
+ with gr.Row():
288
+ gr.Column()
289
+ with gr.Column(scale=2):
290
+ submit_button = gr.Button("Submit Model")
291
+ submission_result = gr.Markdown()
292
+
293
+ submit_button.click(
294
+ handle_new_eval_submission,
295
+ [model_name_textbox, model_zip_file, model_link_textbox],
296
+ submission_result
297
  )
298
+ gr.Column()
299
 
300
  with gr.Row():
301
  with gr.Accordion("πŸ“™ Citation", open=False):
src/about.py CHANGED
@@ -39,55 +39,87 @@ NUM_FEWSHOT = 0 # Change with your few shot
39
 
40
 
41
  # Your leaderboard name
42
- TITLE = """<h1 align="center" id="space-title">3D-POPE Leaderboard</h1>"""
43
 
44
  # What does your leaderboard evaluate?
45
  INTRODUCTION_TEXT = """
46
  #### This is the official leaderboard for the 3D Polling-based Object Probing Evaluation (3D-POPE) benchmark.
47
-
48
- ###### 3D-POPE is designed to assess a model's ability to accurately identify the presence or absence of objects in a given 3D scene.
49
  """
50
 
51
  # Which evaluations are you running? how can people reproduce what you have?
52
  LLM_BENCHMARKS_TEXT = f"""
53
- ## How it works
 
54
 
55
- ## Reproducibility
56
- To reproduce our results, here is the commands you can run:
57
 
58
- """
59
 
60
- EVALUATION_QUEUE_TEXT = """
61
- ## Some good practices before submitting a model
62
-
63
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
64
- ```python
65
- from transformers import AutoConfig, AutoModel, AutoTokenizer
66
- config = AutoConfig.from_pretrained("your model name", revision=revision)
67
- model = AutoModel.from_pretrained("your model name", revision=revision)
68
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
69
- ```
70
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
71
 
72
- Note: make sure your model is public!
73
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
74
 
75
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
76
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
77
 
78
- ### 3) Make sure your model has an open license!
79
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model πŸ€—
 
80
 
81
- ### 4) Fill up your model card
82
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
83
 
84
- ## In case of model failure
85
- If your model is displayed in the `FAILED` category, its execution stopped.
86
- Make sure you have followed the above steps first.
87
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
88
  """
89
 
90
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
91
  CITATION_BUTTON_TEXT = r"""
92
  @misc{yang20243dgrand,
93
  title={3D-GRAND: Towards Better Grounding and Less Hallucination for 3D-LLMs},
 
39
 
40
 
41
  # Your leaderboard name
42
+ TITLE = """<h1 align="center" id="space-title">3D-POPE Leaderboard</h1>
43
+ <p><center>
44
+ <a href="https://3d-grand.github.io/" target="_blank">[Project Page]</a>
45
+ <a href="https://www.dropbox.com/scl/fo/5p9nb4kalnz407sbqgemg/AG1KcxeIS_SUoJ1hoLPzv84?rlkey=weunabtbiz17jitfv3f4jpmm1&dl=0" target="_blank">[3D-GRAND Data]</a>
46
+ <a href="https://www.dropbox.com/scl/fo/inemjtgqt2nkckymn65rp/AGi2KSYU9AHbnpuj7TWYihs?rlkey=ldbn36b1z6nqj74yv5ph6cqwc&dl=0" target="_blank">[3D-POPE Data]</a>
47
+ </center></p>
48
+ """
49
 
50
  # What does your leaderboard evaluate?
51
  INTRODUCTION_TEXT = """
52
  #### This is the official leaderboard for the 3D Polling-based Object Probing Evaluation (3D-POPE) benchmark.
 
 
53
  """
54
 
55
  # Which evaluations are you running? how can people reproduce what you have?
56
  LLM_BENCHMARKS_TEXT = f"""
57
+ # 3D-POPE: A Benchmark for Evaluating Hallucination in 3D-LLMs
58
+ ### To systematically evaluate the hallucination behavior of 3D-LLMs, we introduce the 3D Polling-based Object Probing Evaluation (3D-POPE) benchmark. 3D-POPE is designed to assess a model's ability to accurately identify the presence or absence of objects in a given 3D scene.
59
 
60
+ ## Dataset
61
+ To facilitate the 3D-POPE benchmark, we curate a dedicated dataset from ScanNet, utilizing the semantic classes from ScanNet200. Specifically, we use the ScanNet validation set as the foundation for evaluating 3D-LLMs on the 3D-POPE benchmark.
62
 
63
+ **Benchmark design.** 3D-POPE consists of a set of triples, each comprising a 3D scene, a posed question, and a binary answer ("Yes" or "No") indicating the presence or absence of an object (Fig. 1 middle). To ensure a balanced dataset, we maintain a 1:1 ratio of existent to nonexistent objects when constructing these triples. For the selection of negative samples (i.e., nonexistent objects), we employ three distinct sampling strategies:
64
 
65
+ - **Random Sampling:** Nonexistent objects are randomly selected from the set of objects not present in the 3D scene.
66
+ - **Popular Sampling:** We select the top-k most frequent objects not present in the 3D scene, where k equals the number of objects currently in the scene.
67
+ - **Adversarial Sampling:** For each positively identified object in the scene, we rank objects that are not present and have not been used as negative samples based on their frequency of co-occurrence with the positive object in the training dataset. The highest-ranking co-occurring object is then selected as the adversarial sample. This approach differs from the original POPE [41] to avoid adversarial samples mirroring popular samples, as indoor scenes often contain similar objects.
68
+ These sampling strategies are designed to challenge the model's robustness and assess its susceptibility to different levels of object hallucination.
69
 
70
+ ## Metrics
71
+ To evaluate the model's performance on the 3D-POPE benchmark, we use key metrics including Precision, Recall, F1 Score, Accuracy, and Yes (%). Precision and Recall assess the model's ability to correctly affirm the presence of objects and identify the absence of objects, respectively. Precision is particularly important as it indicates the proportion of non-existing objects generated by the 3D-LLMs. The F1 Score, combining Precision and Recall, offers a balanced view of performance and serves as the primary evaluation metric. Accuracy measures the proportion of correctly answered questions, encompassing both "Yes" and "No" responses. Additionally, the Yes (%) metric reports the ratio of incorrect "Yes" responses to understand the model's tendencies regarding object hallucination.
72
 
73
+ ## Leaderboard
74
+ We establish a public leaderboard for the 3D-POPE benchmark, allowing researchers to submit their 3D-LLM results and compare their performance against other state-of-the-art models. The leaderboard reports the evaluation metrics for each model under the three sampling strategies, providing a transparent and standardized way to assess the hallucination performance of 3D-LLMs.
75
+ """
76
 
77
+ EVALUATION_QUEUE_TEXT = """
78
+ # Submitting results from your own model
79
+
80
+ Read the instructions below **carefully** to ensure your submission is properly formatted and complete.
81
+
82
+ You should submit a total of 12 JSON files, each containing outputs generated by your model on [our dataset](https://www.dropbox.com/scl/fo/inemjtgqt2nkckymn65rp/AGi2KSYU9AHbnpuj7TWYihs?rlkey=ldbn36b1z6nqj74yv5ph6cqwc&dl=0).
83
+ These files should be organized within a single ZIP file using the following folder structure:
84
+ ```bash
85
+ ❯ tree LEO/json-outputs
86
+ LEO/json-outputs
87
+ ├── adversarial_template_1.json
88
+ ├── adversarial_template_2.json
89
+ ├── adversarial_template_3.json
90
+ ├── adversarial_template_4.json
91
+ ├── popular_template_1.json
92
+ ├── popular_template_2.json
93
+ ├── popular_template_3.json
94
+ ├── popular_template_4.json
95
+ ├── random_template_1.json
96
+ ├── random_template_2.json
97
+ ├── random_template_3.json
98
+ └── random_template_4.json
99
+ ```
100
 
101
+ Each JSON file should contain data structured exactly like the following example from random_template_1.json:
102
+ ```json
103
+ {
104
+ "random_template_1": [
105
+ {
106
+ "source": "scannet",
107
+ "scene_id": "scene0011_00",
108
+ "question": "Are there any people in the room?",
109
+ "ground_truth_answer": "no",
110
+ "predicted_answer": "yes",
111
+ "template": "template_1",
112
+ "question_type": "random"
113
+ },
114
+ ...
115
+ ]
116
+ }
117
+ ```
118
 
119
+ Submit your properly formatted ZIP file below, and your model's results will be added to the leaderboard.
120
  """
121
 
122
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite our work"
123
  CITATION_BUTTON_TEXT = r"""
124
  @misc{yang20243dgrand,
125
  title={3D-GRAND: Towards Better Grounding and Less Hallucination for 3D-LLMs},
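As a usage note for the metrics described in the new `LLM_BENCHMARKS_TEXT` (Precision, Recall, F1 Score, Accuracy, Yes (%)), they can be computed directly from a prediction file in the documented format. A minimal sketch, assuming the field names from the `random_template_1.json` example and case-insensitive "yes"/"no" answers; `pope_metrics` is an illustrative name, and the Yes (%) reading here (share of "yes" predictions) may differ from the leaderboard's exact definition:

```python
import json

def pope_metrics(json_path: str, key: str = "random_template_1") -> dict:
    """Compute Precision, Recall, F1, Accuracy, and a Yes ratio for one prediction file."""
    with open(json_path) as f:
        records = json.load(f)[key]

    tp = fp = tn = fn = 0
    for r in records:
        pred_yes = r["predicted_answer"].strip().lower() == "yes"
        gold_yes = r["ground_truth_answer"].strip().lower() == "yes"
        if pred_yes and gold_yes:
            tp += 1
        elif pred_yes:
            fp += 1
        elif gold_yes:
            fn += 1
        else:
            tn += 1

    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    accuracy = (tp + tn) / len(records) if records else 0.0
    yes_pct = 100.0 * (tp + fp) / len(records) if records else 0.0  # share of "yes" predictions
    return {"precision": precision, "recall": recall, "f1": f1,
            "accuracy": accuracy, "yes_pct": yes_pct}
```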