Upload 10 files

Browse files

Files changed (10) hide show

base-2.8b-eval-files/EleutherAI-pythia-2.8b-0shot/results.json +404 -0
base-2.8b-eval-files/EleutherAI-pythia-2.8b-5shot/results.json +404 -0
sft-2.8b-eval-files/EleutherAI-pythia-2.8b-0shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/EleutherAI-pythia-2.8b-5shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/dpo-pythia-2.8b-0shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/dpo-pythia-2.8b-5shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/sft-pythia-2.8b-0shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/sft-pythia-2.8b-0shot/results.json +404 -0
sft-2.8b-eval-files/sft-pythia-2.8b-5shot-shelloutput.txt +24 -0
sft-2.8b-eval-files/sft-pythia-2.8b-5shot/results.json +404 -0

base-2.8b-eval-files/EleutherAI-pythia-2.8b-0shot/results.json ADDED Viewed

	@@ -0,0 +1,404 @@

+{
+  "results": {
+    "arc_challenge": {
+      "acc,none": 0.295221843003413,
+      "acc_stderr,none": 0.013329750293382316,
+      "acc_norm,none": 0.3310580204778157,
+      "acc_norm_stderr,none": 0.013752062419817836
+    },
+    "arc_easy": {
+      "acc,none": 0.6447811447811448,
+      "acc_stderr,none": 0.009820245899287126,
+      "acc_norm,none": 0.5871212121212122,
+      "acc_norm_stderr,none": 0.010102837421104667
+    },
+    "boolq": {
+      "acc,none": 0.6474006116207951,
+      "acc_stderr,none": 0.008356412493562119
+    },
+    "hellaswag": {
+      "acc,none": 0.4531965743875722,
+      "acc_stderr,none": 0.004967872475383267,
+      "acc_norm,none": 0.5926110336586338,
+      "acc_norm_stderr,none": 0.00490344168000382
+    },
+    "lambada_openai": {
+      "perplexity,none": 5.0369907596068435,
+      "perplexity_stderr,none": 0.11909165322070424,
+      "acc,none": 0.6471958082670289,
+      "acc_stderr,none": 0.006657279471298494
+    },
+    "openbookqa": {
+      "acc,none": 0.24,
+      "acc_stderr,none": 0.019118866653759753,
+      "acc_norm,none": 0.358,
+      "acc_norm_stderr,none": 0.02146143486285912
+    },
+    "piqa": {
+      "acc,none": 0.7393906420021763,
+      "acc_stderr,none": 0.010241826155811627,
+      "acc_norm,none": 0.735582154515778,
+      "acc_norm_stderr,none": 0.01028978724476717
+    },
+    "sciq": {
+      "acc,none": 0.887,
+      "acc_stderr,none": 0.010016552866696846,
+      "acc_norm,none": 0.835,
+      "acc_norm_stderr,none": 0.011743632866916171
+    },
+    "wikitext": {
+      "word_perplexity,none": 20.05713713364702,
+      "byte_perplexity,none": 1.6394022363791052,
+      "bits_per_byte,none": 0.7131698710530711
+    },
+    "winogrande": {
+      "acc,none": 0.5974743488555643,
+      "acc_stderr,none": 0.013782866831703043
+    }
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "boolq": {
+      "task": "boolq",
+      "group": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage"
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}",
+      "doc_to_target": "{{label}}",
+      "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "group": [
+        "lambada",
+        "loglikelihood",
+        "perplexity"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}"
+    },
+    "openbookqa": {
+      "task": "openbookqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "openbookqa",
+      "dataset_name": "main",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "question_stem",
+      "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "question_stem"
+    },
+    "piqa": {
+      "task": "piqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal"
+    },
+    "sciq": {
+      "task": "sciq",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "sciq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
+      "doc_to_target": 3,
+      "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{support}} {{question}}"
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "group": [
+        "perplexity",
+        "loglikelihood_rolling"
+      ],
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "<function wikitext_detokenizer at 0x7fbdfeb25120>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}"
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "<function doc_to_text at 0x7fbdfeb25360>",
+      "doc_to_target": "<function doc_to_target at 0x7fbdfeb256c0>",
+      "doc_to_choice": "<function doc_to_choice at 0x7fbdfeb25a20>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence"
+    }
+  },
+  "versions": {
+    "arc_challenge": "Yaml",
+    "arc_easy": "Yaml",
+    "boolq": "Yaml",
+    "hellaswag": "Yaml",
+    "lambada_openai": "Yaml",
+    "openbookqa": "Yaml",
+    "piqa": "Yaml",
+    "sciq": "Yaml",
+    "wikitext": "Yaml",
+    "winogrande": "Yaml"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=EleutherAI/pythia-2.8b",
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": null,
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000
+  },
+  "git_hash": "d1a44c8"
+}

base-2.8b-eval-files/EleutherAI-pythia-2.8b-5shot/results.json ADDED Viewed

	@@ -0,0 +1,404 @@

+{
+  "results": {
+    "arc_challenge": {
+      "acc,none": 0.3122866894197952,
+      "acc_stderr,none": 0.013542598541688065,
+      "acc_norm,none": 0.35494880546075086,
+      "acc_norm_stderr,none": 0.01398303690409409
+    },
+    "arc_easy": {
+      "acc,none": 0.67003367003367,
+      "acc_stderr,none": 0.009648311574241036,
+      "acc_norm,none": 0.6759259259259259,
+      "acc_norm_stderr,none": 0.009603728850095387
+    },
+    "boolq": {
+      "acc,none": 0.6663608562691131,
+      "acc_stderr,none": 0.008246805985556868
+    },
+    "hellaswag": {
+      "acc,none": 0.450408285202151,
+      "acc_stderr,none": 0.0049651776330499236,
+      "acc_norm,none": 0.6030671181039634,
+      "acc_norm_stderr,none": 0.0048826194841666
+    },
+    "lambada_openai": {
+      "perplexity,none": 6.447861085824152,
+      "perplexity_stderr,none": 0.16023935097204245,
+      "acc,none": 0.5947991461284688,
+      "acc_stderr,none": 0.0068396269826581525
+    },
+    "openbookqa": {
+      "acc,none": 0.248,
+      "acc_stderr,none": 0.019332342821239103,
+      "acc_norm,none": 0.372,
+      "acc_norm_stderr,none": 0.0216371979857224
+    },
+    "piqa": {
+      "acc,none": 0.7426550598476604,
+      "acc_stderr,none": 0.01019992106479251,
+      "acc_norm,none": 0.7442872687704026,
+      "acc_norm_stderr,none": 0.010178690109459857
+    },
+    "sciq": {
+      "acc,none": 0.944,
+      "acc_stderr,none": 0.007274401481697066,
+      "acc_norm,none": 0.94,
+      "acc_norm_stderr,none": 0.007513751157474921
+    },
+    "wikitext": {
+      "word_perplexity,none": 20.05713713364702,
+      "byte_perplexity,none": 1.6394022363791052,
+      "bits_per_byte,none": 0.7131698710530711
+    },
+    "winogrande": {
+      "acc,none": 0.5943172849250198,
+      "acc_stderr,none": 0.013800206336014201
+    }
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "boolq": {
+      "task": "boolq",
+      "group": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage"
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}",
+      "doc_to_target": "{{label}}",
+      "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "group": [
+        "lambada",
+        "loglikelihood",
+        "perplexity"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}"
+    },
+    "openbookqa": {
+      "task": "openbookqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "openbookqa",
+      "dataset_name": "main",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "question_stem",
+      "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "question_stem"
+    },
+    "piqa": {
+      "task": "piqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal"
+    },
+    "sciq": {
+      "task": "sciq",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "sciq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
+      "doc_to_target": 3,
+      "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{support}} {{question}}"
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "group": [
+        "perplexity",
+        "loglikelihood_rolling"
+      ],
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "<function wikitext_detokenizer at 0x7f80942fd120>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}"
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "<function doc_to_text at 0x7f80942fd360>",
+      "doc_to_target": "<function doc_to_target at 0x7f80942fd6c0>",
+      "doc_to_choice": "<function doc_to_choice at 0x7f80942fda20>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence"
+    }
+  },
+  "versions": {
+    "arc_challenge": "Yaml",
+    "arc_easy": "Yaml",
+    "boolq": "Yaml",
+    "hellaswag": "Yaml",
+    "lambada_openai": "Yaml",
+    "openbookqa": "Yaml",
+    "piqa": "Yaml",
+    "sciq": "Yaml",
+    "wikitext": "Yaml",
+    "winogrande": "Yaml"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=EleutherAI/pythia-2.8b",
+    "batch_size": "8",
+    "batch_sizes": [],
+    "device": null,
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000
+  },
+  "git_hash": "d1a44c8"
+}

sft-2.8b-eval-files/EleutherAI-pythia-2.8b-0shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=EleutherAI/pythia-2.8b), limit: None, num_fewshot: 0, batch_size: 16
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.2952|±  |0.0133|
+|              |       |none  |acc_norm       | 0.3311|±  |0.0138|
+|arc_easy      |Yaml   |none  |acc            | 0.6448|±  |0.0098|
+|              |       |none  |acc_norm       | 0.5871|±  |0.0101|
+|boolq         |Yaml   |none  |acc            | 0.6474|±  |0.0084|
+|hellaswag     |Yaml   |none  |acc            | 0.4532|±  |0.0050|
+|              |       |none  |acc_norm       | 0.5926|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 5.0370|±  |0.1191|
+|              |       |none  |acc            | 0.6472|±  |0.0067|
+|openbookqa    |Yaml   |none  |acc            | 0.2400|±  |0.0191|
+|              |       |none  |acc_norm       | 0.3580|±  |0.0215|
+|piqa          |Yaml   |none  |acc            | 0.7394|±  |0.0102|
+|              |       |none  |acc_norm       | 0.7356|±  |0.0103|
+|sciq          |Yaml   |none  |acc            | 0.8870|±  |0.0100|
+|              |       |none  |acc_norm       | 0.8350|±  |0.0117|
+|wikitext      |Yaml   |none  |word_perplexity|20.0571|   |      |
+|              |       |none  |byte_perplexity| 1.6394|   |      |
+|              |       |none  |bits_per_byte  | 0.7132|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5975|±  |0.0138|

sft-2.8b-eval-files/EleutherAI-pythia-2.8b-5shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=EleutherAI/pythia-2.8b), limit: None, num_fewshot: 5, batch_size: 8
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.3123|±  |0.0135|
+|              |       |none  |acc_norm       | 0.3549|±  |0.0140|
+|arc_easy      |Yaml   |none  |acc            | 0.6700|±  |0.0096|
+|              |       |none  |acc_norm       | 0.6759|±  |0.0096|
+|boolq         |Yaml   |none  |acc            | 0.6664|±  |0.0082|
+|hellaswag     |Yaml   |none  |acc            | 0.4504|±  |0.0050|
+|              |       |none  |acc_norm       | 0.6031|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 6.4479|±  |0.1602|
+|              |       |none  |acc            | 0.5948|±  |0.0068|
+|openbookqa    |Yaml   |none  |acc            | 0.2480|±  |0.0193|
+|              |       |none  |acc_norm       | 0.3720|±  |0.0216|
+|piqa          |Yaml   |none  |acc            | 0.7427|±  |0.0102|
+|              |       |none  |acc_norm       | 0.7443|±  |0.0102|
+|sciq          |Yaml   |none  |acc            | 0.9440|±  |0.0073|
+|              |       |none  |acc_norm       | 0.9400|±  |0.0075|
+|wikitext      |Yaml   |none  |word_perplexity|20.0571|   |      |
+|              |       |none  |byte_perplexity| 1.6394|   |      |
+|              |       |none  |bits_per_byte  | 0.7132|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5943|±  |0.0138|

sft-2.8b-eval-files/dpo-pythia-2.8b-0shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=lomahony/eleuther-pythia2.8b-hh-dpo), limit: None, num_fewshot: 0, batch_size: 16
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.3302|±  |0.0137|
+|              |       |none  |acc_norm       | 0.3490|±  |0.0139|
+|arc_easy      |Yaml   |none  |acc            | 0.6625|±  |0.0097|
+|              |       |none  |acc_norm       | 0.5918|±  |0.0101|
+|boolq         |Yaml   |none  |acc            | 0.6248|±  |0.0085|
+|hellaswag     |Yaml   |none  |acc            | 0.4677|±  |0.0050|
+|              |       |none  |acc_norm       | 0.6072|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 4.4821|±  |0.1220|
+|              |       |none  |acc            | 0.6350|±  |0.0067|
+|openbookqa    |Yaml   |none  |acc            | 0.2640|±  |0.0197|
+|              |       |none  |acc_norm       | 0.3960|±  |0.0219|
+|piqa          |Yaml   |none  |acc            | 0.7535|±  |0.0101|
+|              |       |none  |acc_norm       | 0.7454|±  |0.0102|
+|sciq          |Yaml   |none  |acc            | 0.8630|±  |0.0109|
+|              |       |none  |acc_norm       | 0.8030|±  |0.0126|
+|wikitext      |Yaml   |none  |word_perplexity|21.9279|   |      |
+|              |       |none  |byte_perplexity| 1.6637|   |      |
+|              |       |none  |bits_per_byte  | 0.7344|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5967|±  |0.0138|

sft-2.8b-eval-files/dpo-pythia-2.8b-5shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=lomahony/eleuther-pythia2.8b-hh-dpo), limit: None, num_fewshot: 5, batch_size: 8
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.3353|±  |0.0138|
+|              |       |none  |acc_norm       | 0.3788|±  |0.0142|
+|arc_easy      |Yaml   |none  |acc            | 0.6890|±  |0.0095|
+|              |       |none  |acc_norm       | 0.6936|±  |0.0095|
+|boolq         |Yaml   |none  |acc            | 0.6495|±  |0.0083|
+|hellaswag     |Yaml   |none  |acc            | 0.4680|±  |0.0050|
+|              |       |none  |acc_norm       | 0.6124|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 5.9966|±  |0.1685|
+|              |       |none  |acc            | 0.5789|±  |0.0069|
+|openbookqa    |Yaml   |none  |acc            | 0.2720|±  |0.0199|
+|              |       |none  |acc_norm       | 0.3740|±  |0.0217|
+|piqa          |Yaml   |none  |acc            | 0.7535|±  |0.0101|
+|              |       |none  |acc_norm       | 0.7573|±  |0.0100|
+|sciq          |Yaml   |none  |acc            | 0.9340|±  |0.0079|
+|              |       |none  |acc_norm       | 0.9190|±  |0.0086|
+|wikitext      |Yaml   |none  |word_perplexity|21.9279|   |      |
+|              |       |none  |byte_perplexity| 1.6637|   |      |
+|              |       |none  |bits_per_byte  | 0.7344|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5967|±  |0.0138|

sft-2.8b-eval-files/sft-pythia-2.8b-0shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=lomahony/eleuther-pythia2.8b-hh-sft), limit: None, num_fewshot: 0, batch_size: 16
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.3003|±  |0.0134|
+|              |       |none  |acc_norm       | 0.3268|±  |0.0137|
+|arc_easy      |Yaml   |none  |acc            | 0.6486|±  |0.0098|
+|              |       |none  |acc_norm       | 0.5657|±  |0.0102|
+|boolq         |Yaml   |none  |acc            | 0.6468|±  |0.0084|
+|hellaswag     |Yaml   |none  |acc            | 0.4516|±  |0.0050|
+|              |       |none  |acc_norm       | 0.5870|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 4.9120|±  |0.1230|
+|              |       |none  |acc            | 0.6344|±  |0.0067|
+|openbookqa    |Yaml   |none  |acc            | 0.2540|±  |0.0195|
+|              |       |none  |acc_norm       | 0.3700|±  |0.0216|
+|piqa          |Yaml   |none  |acc            | 0.7448|±  |0.0102|
+|              |       |none  |acc_norm       | 0.7405|±  |0.0102|
+|sciq          |Yaml   |none  |acc            | 0.8720|±  |0.0106|
+|              |       |none  |acc_norm       | 0.8010|±  |0.0126|
+|wikitext      |Yaml   |none  |word_perplexity|20.7220|   |      |
+|              |       |none  |byte_perplexity| 1.6482|   |      |
+|              |       |none  |bits_per_byte  | 0.7209|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5888|±  |0.0138|

sft-2.8b-eval-files/sft-pythia-2.8b-0shot/results.json ADDED Viewed

	@@ -0,0 +1,404 @@

+{
+  "results": {
+    "arc_challenge": {
+      "acc,none": 0.3003412969283277,
+      "acc_stderr,none": 0.013395909309957,
+      "acc_norm,none": 0.3267918088737201,
+      "acc_norm_stderr,none": 0.013706665975587336
+    },
+    "arc_easy": {
+      "acc,none": 0.6485690235690236,
+      "acc_stderr,none": 0.00979639558281772,
+      "acc_norm,none": 0.5656565656565656,
+      "acc_norm_stderr,none": 0.010170943451269428
+    },
+    "boolq": {
+      "acc,none": 0.6467889908256881,
+      "acc_stderr,none": 0.008359705247064303
+    },
+    "hellaswag": {
+      "acc,none": 0.45160326628161723,
+      "acc_stderr,none": 0.004966351835028203,
+      "acc_norm,none": 0.5870344552877913,
+      "acc_norm_stderr,none": 0.004913604782665858
+    },
+    "lambada_openai": {
+      "perplexity,none": 4.911988887150992,
+      "perplexity_stderr,none": 0.12302146370246485,
+      "acc,none": 0.6343877352998254,
+      "acc_stderr,none": 0.006709649590864073
+    },
+    "openbookqa": {
+      "acc,none": 0.254,
+      "acc_stderr,none": 0.01948659680164338,
+      "acc_norm,none": 0.37,
+      "acc_norm_stderr,none": 0.021613289165165785
+    },
+    "piqa": {
+      "acc,none": 0.7448313384113167,
+      "acc_stderr,none": 0.010171571592521822,
+      "acc_norm,none": 0.7404787812840044,
+      "acc_norm_stderr,none": 0.01022793988817392
+    },
+    "sciq": {
+      "acc,none": 0.872,
+      "acc_stderr,none": 0.010570133761108656,
+      "acc_norm,none": 0.801,
+      "acc_norm_stderr,none": 0.012631649083099182
+    },
+    "wikitext": {
+      "word_perplexity,none": 20.722018295590285,
+      "byte_perplexity,none": 1.6482397447214083,
+      "bits_per_byte,none": 0.720926104999623
+    },
+    "winogrande": {
+      "acc,none": 0.5887924230465666,
+      "acc_stderr,none": 0.013829128358676862
+    }
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "boolq": {
+      "task": "boolq",
+      "group": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage"
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}",
+      "doc_to_target": "{{label}}",
+      "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "group": [
+        "lambada",
+        "loglikelihood",
+        "perplexity"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}"
+    },
+    "openbookqa": {
+      "task": "openbookqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "openbookqa",
+      "dataset_name": "main",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "question_stem",
+      "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "question_stem"
+    },
+    "piqa": {
+      "task": "piqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal"
+    },
+    "sciq": {
+      "task": "sciq",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "sciq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
+      "doc_to_target": 3,
+      "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{support}} {{question}}"
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "group": [
+        "perplexity",
+        "loglikelihood_rolling"
+      ],
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "<function wikitext_detokenizer at 0x7faa07711120>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}"
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "<function doc_to_text at 0x7faa07711360>",
+      "doc_to_target": "<function doc_to_target at 0x7faa077116c0>",
+      "doc_to_choice": "<function doc_to_choice at 0x7faa07711a20>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 0,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence"
+    }
+  },
+  "versions": {
+    "arc_challenge": "Yaml",
+    "arc_easy": "Yaml",
+    "boolq": "Yaml",
+    "hellaswag": "Yaml",
+    "lambada_openai": "Yaml",
+    "openbookqa": "Yaml",
+    "piqa": "Yaml",
+    "sciq": "Yaml",
+    "wikitext": "Yaml",
+    "winogrande": "Yaml"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=lomahony/eleuther-pythia2.8b-hh-sft",
+    "batch_size": "16",
+    "batch_sizes": [],
+    "device": null,
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000
+  },
+  "git_hash": "d1a44c8"
+}

sft-2.8b-eval-files/sft-pythia-2.8b-5shot-shelloutput.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+bootstrapping for stddev: perplexity
+hf (pretrained=lomahony/eleuther-pythia2.8b-hh-sft), limit: None, num_fewshot: 5, batch_size: 8
+|     Task     |Version|Filter|    Metric     | Value |   |Stderr|
+|--------------|-------|------|---------------|------:|---|-----:|
+|arc_challenge |Yaml   |none  |acc            | 0.3319|±  |0.0138|
+|              |       |none  |acc_norm       | 0.3584|±  |0.0140|
+|arc_easy      |Yaml   |none  |acc            | 0.6839|±  |0.0095|
+|              |       |none  |acc_norm       | 0.6843|±  |0.0095|
+|boolq         |Yaml   |none  |acc            | 0.6700|±  |0.0082|
+|hellaswag     |Yaml   |none  |acc            | 0.4509|±  |0.0050|
+|              |       |none  |acc_norm       | 0.5901|±  |0.0049|
+|lambada_openai|Yaml   |none  |perplexity     | 6.5403|±  |0.1706|
+|              |       |none  |acc            | 0.5758|±  |0.0069|
+|openbookqa    |Yaml   |none  |acc            | 0.2560|±  |0.0195|
+|              |       |none  |acc_norm       | 0.3580|±  |0.0215|
+|piqa          |Yaml   |none  |acc            | 0.7486|±  |0.0101|
+|              |       |none  |acc_norm       | 0.7421|±  |0.0102|
+|sciq          |Yaml   |none  |acc            | 0.9410|±  |0.0075|
+|              |       |none  |acc_norm       | 0.9370|±  |0.0077|
+|wikitext      |Yaml   |none  |word_perplexity|20.7220|   |      |
+|              |       |none  |byte_perplexity| 1.6482|   |      |
+|              |       |none  |bits_per_byte  | 0.7209|   |      |
+|winogrande    |Yaml   |none  |acc            | 0.5904|±  |0.0138|

sft-2.8b-eval-files/sft-pythia-2.8b-5shot/results.json ADDED Viewed

	@@ -0,0 +1,404 @@

+{
+  "results": {
+    "arc_challenge": {
+      "acc,none": 0.3319112627986348,
+      "acc_stderr,none": 0.013760988200880536,
+      "acc_norm,none": 0.3583617747440273,
+      "acc_norm_stderr,none": 0.014012883334859862
+    },
+    "arc_easy": {
+      "acc,none": 0.6839225589225589,
+      "acc_stderr,none": 0.009540440071928283,
+      "acc_norm,none": 0.6843434343434344,
+      "acc_norm_stderr,none": 0.009537019245566084
+    },
+    "boolq": {
+      "acc,none": 0.6700305810397553,
+      "acc_stderr,none": 0.008223878741654847
+    },
+    "hellaswag": {
+      "acc,none": 0.4509061939852619,
+      "acc_stderr,none": 0.004965670398127348,
+      "acc_norm,none": 0.5901214897430791,
+      "acc_norm_stderr,none": 0.004908059353503842
+    },
+    "lambada_openai": {
+      "perplexity,none": 6.540250350436668,
+      "perplexity_stderr,none": 0.17057269700844382,
+      "acc,none": 0.5757810983892878,
+      "acc_stderr,none": 0.006885504751619328
+    },
+    "openbookqa": {
+      "acc,none": 0.256,
+      "acc_stderr,none": 0.019536923574747605,
+      "acc_norm,none": 0.358,
+      "acc_norm_stderr,none": 0.021461434862859115
+    },
+    "piqa": {
+      "acc,none": 0.7486398258977149,
+      "acc_stderr,none": 0.010121156016819257,
+      "acc_norm,none": 0.7421109902067464,
+      "acc_norm_stderr,none": 0.010206956662056257
+    },
+    "sciq": {
+      "acc,none": 0.941,
+      "acc_stderr,none": 0.007454835650406723,
+      "acc_norm,none": 0.937,
+      "acc_norm_stderr,none": 0.007687007876286444
+    },
+    "wikitext": {
+      "word_perplexity,none": 20.722018295590285,
+      "byte_perplexity,none": 1.6482397447214083,
+      "bits_per_byte,none": 0.720926104999623
+    },
+    "winogrande": {
+      "acc,none": 0.590370955011839,
+      "acc_stderr,none": 0.013821049109655478
+    }
+  },
+  "configs": {
+    "arc_challenge": {
+      "task": "arc_challenge",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Challenge",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "arc_easy": {
+      "task": "arc_easy",
+      "group": [
+        "ai2_arc",
+        "multiple_choice"
+      ],
+      "dataset_path": "ai2_arc",
+      "dataset_name": "ARC-Easy",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "Question: {{question}}\nAnswer:",
+      "doc_to_target": "{{choices.label.index(answerKey)}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:"
+    },
+    "boolq": {
+      "task": "boolq",
+      "group": [
+        "super-glue-lm-eval-v1"
+      ],
+      "dataset_path": "super_glue",
+      "dataset_name": "boolq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{{passage}}\nQuestion: {{question}}?\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": [
+        "no",
+        "yes"
+      ],
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc"
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "passage"
+    },
+    "hellaswag": {
+      "task": "hellaswag",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "hellaswag",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}",
+      "doc_to_target": "{{label}}",
+      "doc_to_choice": "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": false
+    },
+    "lambada_openai": {
+      "task": "lambada_openai",
+      "group": [
+        "lambada",
+        "loglikelihood",
+        "perplexity"
+      ],
+      "dataset_path": "EleutherAI/lambada_openai",
+      "dataset_name": "default",
+      "test_split": "test",
+      "doc_to_text": "{{text.split(' ')[:-1]|join(' ')}}",
+      "doc_to_target": "{{' '+text.split(' ')[-1]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "perplexity",
+          "aggregation": "perplexity",
+          "higher_is_better": false
+        },
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "loglikelihood",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{text}}"
+    },
+    "openbookqa": {
+      "task": "openbookqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "openbookqa",
+      "dataset_name": "main",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "question_stem",
+      "doc_to_target": "{{choices.label.index(answerKey.lstrip())}}",
+      "doc_to_choice": "{{choices.text}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "question_stem"
+    },
+    "piqa": {
+      "task": "piqa",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "piqa",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "Question: {{goal}}\nAnswer:",
+      "doc_to_target": "label",
+      "doc_to_choice": "{{[sol1, sol2]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "goal"
+    },
+    "sciq": {
+      "task": "sciq",
+      "group": [
+        "multiple_choice"
+      ],
+      "dataset_path": "sciq",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:",
+      "doc_to_target": 3,
+      "doc_to_choice": "{{[distractor1, distractor2, distractor3, correct_answer]}}",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        },
+        {
+          "metric": "acc_norm",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{support}} {{question}}"
+    },
+    "wikitext": {
+      "task": "wikitext",
+      "group": [
+        "perplexity",
+        "loglikelihood_rolling"
+      ],
+      "dataset_path": "EleutherAI/wikitext_document_level",
+      "dataset_name": "wikitext-2-raw-v1",
+      "training_split": "train",
+      "validation_split": "validation",
+      "test_split": "test",
+      "doc_to_text": "",
+      "doc_to_target": "<function wikitext_detokenizer at 0x7fbe1b3e5120>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "word_perplexity"
+        },
+        {
+          "metric": "byte_perplexity"
+        },
+        {
+          "metric": "bits_per_byte"
+        }
+      ],
+      "output_type": "loglikelihood_rolling",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "{{page}}"
+    },
+    "winogrande": {
+      "task": "winogrande",
+      "dataset_path": "winogrande",
+      "dataset_name": "winogrande_xl",
+      "training_split": "train",
+      "validation_split": "validation",
+      "doc_to_text": "<function doc_to_text at 0x7fbe1b3e5360>",
+      "doc_to_target": "<function doc_to_target at 0x7fbe1b3e56c0>",
+      "doc_to_choice": "<function doc_to_choice at 0x7fbe1b3e5a20>",
+      "description": "",
+      "target_delimiter": " ",
+      "fewshot_delimiter": "\n\n",
+      "num_fewshot": 5,
+      "metric_list": [
+        {
+          "metric": "acc",
+          "aggregation": "mean",
+          "higher_is_better": true
+        }
+      ],
+      "output_type": "multiple_choice",
+      "repeats": 1,
+      "should_decontaminate": true,
+      "doc_to_decontamination_query": "sentence"
+    }
+  },
+  "versions": {
+    "arc_challenge": "Yaml",
+    "arc_easy": "Yaml",
+    "boolq": "Yaml",
+    "hellaswag": "Yaml",
+    "lambada_openai": "Yaml",
+    "openbookqa": "Yaml",
+    "piqa": "Yaml",
+    "sciq": "Yaml",
+    "wikitext": "Yaml",
+    "winogrande": "Yaml"
+  },
+  "config": {
+    "model": "hf",
+    "model_args": "pretrained=lomahony/eleuther-pythia2.8b-hh-sft",
+    "batch_size": "8",
+    "batch_sizes": [],
+    "device": null,
+    "use_cache": null,
+    "limit": null,
+    "bootstrap_iters": 100000
+  },
+  "git_hash": "d1a44c8"
+}