future-xy committed on
Commit
998f2a6
1 Parent(s): 21309a8

fix display bugs

Browse files
app.py CHANGED
@@ -152,7 +152,7 @@ with demo:
152
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
153
 
154
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
155
- with gr.TabItem("Hallucinations Benchmark",
156
  elem_id="llm-benchmark-tab-table",
157
  id=0):
158
  with gr.Row():
 
152
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
153
 
154
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
155
+ with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark",
156
  elem_id="llm-benchmark-tab-table",
157
  id=0):
158
  with gr.Row():
backend-cli.py CHANGED
@@ -267,7 +267,7 @@ if __name__ == "__main__":
267
  #debug specific task by ping
268
  if local_debug:
269
  debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
270
- debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
271
  # debug_task_name = 'ifeval'
272
  debug_task_name = 'selfcheckgpt'
273
  task_lst = TASKS_HARNESS.copy()
 
267
  #debug specific task by ping
268
  if local_debug:
269
  debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
270
+ # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
271
  # debug_task_name = 'ifeval'
272
  debug_task_name = 'selfcheckgpt'
273
  task_lst = TASKS_HARNESS.copy()
cli/create_request_file.py CHANGED
@@ -9,7 +9,7 @@ from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
10
 
11
  EVAL_REQUESTS_PATH = "eval-queue"
12
- QUEUE_REPO = "hallucinations-leaderboard/requests"
13
 
14
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
 
9
  from huggingface_hub import HfApi, snapshot_download
10
 
11
  EVAL_REQUESTS_PATH = "eval-queue"
12
+ QUEUE_REPO = "PingAndPasquale/requests"
13
 
14
  precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
src/display/about.py CHANGED
@@ -1,117 +1,18 @@
1
  from src.display.utils import ModelType
2
 
3
- TITLE = """<h1 align="center" id="space-title">Hallucinations Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
- 📐 The Hallucinations Leaderboard aims to track, rank and evaluate hallucinations in LLMs.
7
 
8
- It evaluates the propensity for hallucination in Large Language Models (LLMs) across a diverse array of tasks, including Closed-book Open-domain QA, Summarization, Reading Comprehension, Instruction Following, Fact-Checking, Hallucination Detection, and Self-Consistency. The evaluation encompasses a wide range of datasets such as NQ Open, TriviaQA, TruthfulQA, XSum, CNN/DM, RACE, SQuADv2, MemoTrap, IFEval, FEVER, FaithDial, True-False, HaluEval, and SelfCheckGPT, offering a comprehensive assessment of each model's performance in generating accurate and contextually relevant content.
9
 
10
- A more detailed explanation of the definition of hallucination and the leaderboard's motivation, tasks and dataset can be found on the "About" page and [The Hallucinations Leaderboard blog post](https://huggingface.co/blog/leaderboards-on-the-hub-hallucinations).
11
-
12
- Submit a model for automated evaluation on the [Edinburgh International Data Facility](https://www.epcc.ed.ac.uk/hpc-services/edinburgh-international-data-facility) (EIDF) GPU cluster on the "Submit" page.
13
- The backend of the Hallucinations leaderboard is based on the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) --- more details in the "About" page.
14
- Metrics and datasets used by the Hallucinations Leaderboard were identified while writing our [awesome-hallucinations-detection](https://github.com/EdinburghNLP/awesome-hallucination-detection) page (you are encouraged to contribute to this list via pull requests).
15
- If you have comments or suggestions on datasets and metrics, please [reach out to us in our discussion forum](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/discussions).
16
-
17
- The Hallucination Leaderboard includes a variety of tasks identified while working on the [awesome-hallucination-detection](https://github.com/EdinburghNLP/awesome-hallucination-detection) repository:
18
- - **Closed-book Open-domain QA** -- [NQ Open](https://huggingface.co/datasets/nq_open) (8-shot and 64-shot), [TriviaQA](https://huggingface.co/datasets/trivia_qa) (8-shot and 64-shot), [TruthfulQA](https://huggingface.co/datasets/truthful_qa) ([MC1](https://huggingface.co/datasets/truthful_qa/viewer/multiple_choice), [MC2](https://huggingface.co/datasets/truthful_qa/viewer/multiple_choice), and [Generative](https://huggingface.co/datasets/truthful_qa/viewer/generation))
19
- - **Summarisation** -- [XSum](https://huggingface.co/datasets/EdinburghNLP/xsum), [CNN/DM](https://huggingface.co/datasets/cnn_dailymail)
20
- - **Reading Comprehension** -- [RACE](https://huggingface.co/datasets/EleutherAI/race)
21
- - **Instruction Following** -- [MemoTrap](https://huggingface.co/datasets/pminervini/inverse-scaling/viewer/memo-trap), [IFEval](https://huggingface.co/datasets/wis-k/instruction-following-eval)
22
- - **Hallucination Detection** -- [FaithDial](https://huggingface.co/datasets/McGill-NLP/FaithDial), [True-False](https://huggingface.co/datasets/pminervini/true-false), [HaluEval](https://huggingface.co/datasets/pminervini/HaluEval) ([QA](https://huggingface.co/datasets/pminervini/HaluEval/viewer/qa_samples), [Summarisation](https://huggingface.co/datasets/pminervini/HaluEval/viewer/summarization_samples), and [Dialogue](https://huggingface.co/datasets/pminervini/HaluEval/viewer/dialogue_samples))
23
- - **Self-Consistency** -- [SelfCheckGPT](https://huggingface.co/datasets/potsawee/wiki_bio_gpt3_hallucination)
24
-
25
- For more information about the leaderboard, check our [HuggingFace Blog article](https://huggingface.co/blog/leaderboards-on-the-hub-hallucinations).
26
  """
27
 
28
  LLM_BENCHMARKS_TEXT = f"""
29
- # Context
30
- As large language models (LLMs) get better at creating believable texts, addressing hallucinations in LLMs becomes increasingly important. In this exciting time when numerous LLMs are released every week, it can be challenging to identify the leading models, particularly in terms of their reliability against hallucination. This leaderboard aims to provide a platform where anyone can evaluate the latest LLMs at any time.
31
 
32
- # How it works
33
- 📈 We evaluate the models on 19 hallucination benchmarks spanning from open-ended to close-ended generation using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
34
  """
35
  LLM_BENCHMARKS_DETAILS = f"""
36
 
37
- ### Question Answering
38
- - <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open </a> - a dataset of open domain question answering which can be answered using the contents of English Wikipedia. 64-shot setup.
39
- - <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open 8 </a> - a dataset of open domain question answering which can be answered using the contents of English Wikipedia. 8-shot setup.
40
- - <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC1 </a> - a benchmark to measure whether a language model is truthful in generating answers to questions that span 38 categories, including health, law, finance and politics. Questions are crafted so that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. **MC1 denotes that there is a single correct label**.
41
- - <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC2 </a> - a benchmark to measure whether a language model is truthful in generating answers to questions that span 38 categories, including health, law, finance and politics. Questions are crafted so that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. **MC2 denotes that there can be multiple correct labels**.
42
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval QA </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **QA denotes the question answering task**.
43
- - <a href="https://aclanthology.org/D16-1264/" target="_blank"> SQuADv2 </a> - a combination of 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.
44
-
45
- ### Reading Comprehension
46
- - <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA </a> - a reading comprehension dataset containing over 650K question-answer-evidence triples originating from trivia enthusiasts. 64-shot setup.
47
- - <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA 8 </a> - a reading comprehension dataset containing over 650K question-answer-evidence triples originating from trivia enthusiasts. 8-shot setup.
48
- - <a href="https://aclanthology.org/D17-1082/" target="_blank"> RACE </a> - a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions. The dataset is collected from English examinations in China, which are designed for middle school and high school students.
49
-
50
- ### Summarisation
51
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Summ </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **Summ denotes the summarisation task**.
52
- - <a href="https://aclanthology.org/2020.acl-main.173/" target="_blank"> XSum </a> - a dataset of BBC news articles paired with their single-sentence summaries to evaluate the output of abstractive summarization using a language model.
53
- - <a href="https://arxiv.org/abs/1704.04368" target="_blank"> CNN/DM </a> - a dataset of CNN and Daily Mail articles paired with their summaries.
54
-
55
- ### Dialogue
56
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Dial </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **Dial denotes the knowledge-grounded dialogue task**.
57
- - <a href="https://aclanthology.org/2022.tacl-1.84/" target="_blank"> FaithDial </a> - a faithful knowledge-grounded dialogue benchmark, composed of 50,761 turns spanning 5649 conversations. It was curated through Amazon Mechanical Turk by asking annotators to amend hallucinated utterances in Wizard of Wikipedia (WoW). In our dialogue setting, we simulate interactions between two speakers: an information seeker and a bot wizard. The seeker has a large degree of freedom as opposed to the wizard bot which is more restricted on what it can communicate.
58
-
59
- ### Fact Check
60
- - <a href="https://github.com/inverse-scaling/prize/tree/main" target="_blank"> MemoTrap </a> - a dataset to investigate whether language models could fall into memorization traps. It comprises instructions that prompt the language model to complete a well-known proverb with an ending word that deviates from the commonly used ending (e.g., Write a quote that ends in the word “early”: Better late than ).
61
- - <a href="https://arxiv.org/abs/2303.08896" target="_blank"> SelfCheckGPT </a> - a simple sampling-based approach that can be used to fact-check the responses of black-box models in a zero-resource fashion, i.e. without an external database. This task uses generative models to generate wikipedia passage based on given starting topics/words. Then generated passages are measured by [selfcheckgpt](https://github.com/potsawee/selfcheckgpt).
62
- - <a href="https://arxiv.org/abs/1803.05355" target="_blank"> FEVER </a> - a dataset of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment.
63
- - <a href="https://aclanthology.org/2023.findings-emnlp.68/" target="_blank"> TrueFalse </a> - a dataset of true and false statements. These statements must have a clear true or false label, and must be based on information present in the LLM’s training data. It covers the following topics: “Cities”, “Inventions”, “Chemical Elements”, “Animals”, “Companies”, and “Scientific Facts”.
64
-
65
- ### Instruction following
66
- - <a href="https://arxiv.org/abs/2311.07911v1" target="_blank"> IFEval </a> - a dataset to evaluate instruction following ability of large language models. There are 500+ prompts with instructions such as "write an article with more than 800 words", "wrap your response with double quotation marks".
67
-
68
- # Details and logs
69
- - detailed results in the `results`: https://huggingface.co/datasets/hallucinations-leaderboard/results/tree/main
70
- - You can find details on the input/outputs for the models in the `details` of each model, that you can access by clicking the 📄 emoji after the model name
71
-
72
- # Reproducibility
73
- To reproduce our results, run the following command, using [this script](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/blob/main/backend-cli.py): `python backend-cli.py`.
74
-
75
- Alternatively, if you're interested in evaluating a specific task with a particular model, you can use the [EleutherAI LLM Evaluation Harness library](https://github.com/EleutherAI/lm-evaluation-harness/) as follows:
76
- `python main.py --model=hf-auto --model_args="pretrained=<your_model>,revision=<your_model_revision>,parallelize=True"`
77
- ` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>`
78
-
79
- Note that the Hallucinations Library includes several task definitions that are not included in the Harness library -- you can find them at [this link](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/tree/main/src/backend/tasks).
80
-
81
- The total batch size we get for models which fit on one A100 node is 8 (8 GPUs * 1). If you don't use parallelism, adapt your batch size to fit. You can expect results to vary slightly for different batch sizes because of padding.
82
-
83
- The tasks and few shots parameters are:
84
-
85
- - <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open </a> (`nq_open`): 64-shot (`exact_match`)
86
- - <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open 8 </a> (`nq8`): 8-shot (`exact_match`)
87
- - <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA </a> (`triviaqa`): 64-shot (`exact_match`)
88
- - <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA 8 </a> (`tqa8`): 8-shot (`exact_match`)
89
- - <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC1 </a> (`truthfulqa_mc1`): 0-shot (`acc`)
90
- - <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC2 </a> (`truthfulqa_mc2`): 0-shot (`acc`)
91
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval QA </a> (`halueval_qa`): 0-shot (`em`)
92
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Summ </a> (`halueval_summarization`): 0-shot (`em`)
93
- - <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Dial </a> (`halueval_dialogue`): 0-shot (`em`)
94
- - <a href="https://aclanthology.org/2020.acl-main.173/" target="_blank"> XSum </a> (`xsum`): 2-shot (`rougeLsum`)
95
- - <a href="https://arxiv.org/abs/1704.04368" target="_blank"> CNN/DM </a> (`cnndm`): 2-shot (`rougeLsum`)
96
- - <a href="https://github.com/inverse-scaling/prize/tree/main" target="_blank"> MemoTrap </a> (`trap`): 0-shot (`acc`)
97
- - <a href="https://arxiv.org/abs/2311.07911v1" target="_blank"> IFEval </a> (`ifeval`): 0-shot (`prompt_level_strict_acc`)
98
- - <a href="https://arxiv.org/abs/2303.08896" target="_blank"> SelfCheckGPT </a> (`selfcheckgpt`): 0 (-)
99
- - <a href="https://arxiv.org/abs/1803.05355" target="_blank"> FEVER </a> (`fever10`): 16-shot (`acc`)
100
- - <a href="https://aclanthology.org/D16-1264/" target="_blank"> SQuADv2 </a> (`squadv2`): 4-shot (`squad_v2`)
101
- - <a href="https://aclanthology.org/2023.findings-emnlp.68/" target="_blank"> TrueFalse </a> (`truefalse_cieacf`): 8-shot (`acc`)
102
- - <a href="https://aclanthology.org/2022.tacl-1.84/" target="_blank"> FaithDial </a> (`faithdial_hallu`): 8-shot (`acc`)
103
- - <a href="https://aclanthology.org/D17-1082/" target="_blank"> RACE </a> (`race`): 0-shot (`acc`)
104
-
105
- For all these evaluations, a higher score is a better score.
106
-
107
- ## Icons
108
- - {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpus
109
- - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
110
- Specific fine-tune subcategories (more adapted to chat):
111
- - {ModelType.chat.to_str(" : ")} model: chat models (RLHF, DPO, IFT, ...).
112
- - {ModelType.merges.to_str(" : ")} model: base merges and moerges.
113
- - {ModelType.Unknown.to_str(" : ")} model: Unknown model type
114
- If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
115
  """
116
 
117
  FAQ_TEXT = """
@@ -126,54 +27,10 @@ XXX
126
  """
127
 
128
  EVALUATION_QUEUE_TEXT = """
129
- # Evaluation Queue for the Hallucinations Leaderboard
130
-
131
- Models added here will be automatically evaluated on the EIDF cluster.
132
-
133
- ## First steps before submitting a model
134
-
135
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
136
- ```python
137
- from transformers import AutoConfig, AutoModel, AutoTokenizer
138
- config = AutoConfig.from_pretrained("your model name", revision=revision)
139
- model = AutoModel.from_pretrained("your model name", revision=revision)
140
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
141
- ```
142
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
143
-
144
- Note: make sure your model is public!
145
- Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it -- stay posted!
146
-
147
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
148
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
149
-
150
- ### 3) Select the correct precision
151
- Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation error (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
152
 
153
- ## In case of model failure
154
- If your model is displayed in the `FAILED` category, its execution stopped.
155
- Make sure you have followed the above steps first.
156
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the command in the About tab under "Reproducibility" with all arguments specified (you can add `--limit` to limit the number of examples per task).
157
  """
158
 
159
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
160
  CITATION_BUTTON_TEXT = r"""
161
- @misc{hallucinations-leaderboard,
162
- author = {Pasquale Minervini and Ping Nie and Clémentine Fourrier and Rohit Saxena and Aryo Pradipta Gema and Xuanli He and others},
163
- title = {Hallucinations Leaderboard},
164
- year = {2024},
165
- publisher = {Hugging Face},
166
- howpublished = "\url{https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard}"
167
- }
168
 
169
- @misc{eval-harness,
170
- author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
171
- title = {A framework for few-shot language model evaluation},
172
- month = 12,
173
- year = 2023,
174
- publisher = {Zenodo},
175
- version = {v0.4.0},
176
- doi = {10.5281/zenodo.10256836},
177
- url = {https://zenodo.org/records/10256836}
178
- }
179
  """
 
1
  from src.display.utils import ModelType
2
 
3
+ TITLE = """<h1 align="center" id="space-title">MOE LLM GPU-Poor Leaderboard</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
+ 📐 The MOE LLM GPU-Poor Leaderboard aims to track, rank and evaluate hallucinations in LLMs.
7
 
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
 
11
  LLM_BENCHMARKS_TEXT = f"""
 
 
12
 
 
 
13
  """
14
  LLM_BENCHMARKS_DETAILS = f"""
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
 
18
  FAQ_TEXT = """
 
27
  """
28
 
29
  EVALUATION_QUEUE_TEXT = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
 
 
 
 
31
  """
32
 
33
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
34
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
 
35
 
 
 
 
 
 
 
 
 
 
 
36
  """
src/display/utils.py CHANGED
@@ -61,28 +61,29 @@ class ColumnContent:
61
  dummy: bool = False
62
 
63
  auto_eval_column_dict = []
 
64
  # Init
65
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
66
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
67
 
68
- #Scores
69
- # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
70
 
71
  for task in Tasks:
72
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
73
 
74
- # Model information
75
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
76
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
77
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
78
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
79
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
80
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
81
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
82
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
83
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
84
- # Dummy column for the search bar (hidden by the custom CSS)
85
- auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
86
 
87
  # We use make dataclass to dynamically fill the scores from Tasks
88
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
61
  dummy: bool = False
62
 
63
  auto_eval_column_dict = []
64
+ auto_eval_column_dict.append(["system", ColumnContent, ColumnContent("System", "str", True, never_hidden=True)])
65
  # Init
66
+ # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
67
+ # auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
68
 
69
+ # #Scores
70
+ # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
71
 
72
  for task in Tasks:
73
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
74
 
75
+ # # Model information
76
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
77
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
78
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
79
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
80
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
81
+ # auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
82
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
83
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
84
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
85
+ # # Dummy column for the search bar (hidden by the custom CSS)
86
+ # auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
87
 
88
  # We use make dataclass to dynamically fill the scores from Tasks
89
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)