---
license: apache-2.0
pipeline_tag: text-generation
library_name: transformers
language:
  - en
  - am
  - ar
  - as
  - az
  - be
  - bg
  - bn
  - br
  - bs
  - ca
  - cs
  - cy
  - da
  - de
  - el
  - eo
  - es
  - et
  - eu
  - fa
  - ff
  - fi
  - fr
  - fy
  - ga
  - gd
  - gl
  - gn
  - gu
  - ha
  - he
  - hi
  - hr
  - ht
  - hu
  - hy
  - id
  - ig
  - is
  - it
  - ja
  - jv
  - ka
  - kk
  - km
  - kn
  - ko
  - ku
  - ky
  - la
  - lg
  - li
  - ln
  - lo
  - lt
  - lv
  - mg
  - mk
  - ml
  - mn
  - mr
  - ms
  - my
  - ne
  - nl
  - 'no'
  - ns
  - om
  - or
  - pa
  - pl
  - ps
  - pt
  - qu
  - rm
  - ro
  - ru
  - sa
  - si
  - sc
  - sd
  - sk
  - sl
  - so
  - sq
  - sr
  - ss
  - su
  - sv
  - sw
  - ta
  - te
  - th
  - tl
  - tn
  - tr
  - ug
  - uk
  - ur
  - uz
  - vi
  - wo
  - xh
  - yi
  - yo
  - zu
datasets:
  - yahma/alpaca-cleaned
  - saillab/taco-datasets
  - xu-song/cc100-samples
  - badrex/llm-emoji-dataset
  - pszemraj/simple_wikipedia
  - AtlasUnified/Atlas-Reasoning
  - fblgit/simple-math
  - AtlasUnified/atlas-math-sets
  - rvv-karma/Math-QA
  - microsoft/orca-math-word-problems-200k
  - meta-math/MetaMathQA
  - TIGER-Lab/MathInstruct
  - ChuGyouk/WebInstructSub-only-socratic
  - thesven/gsm8k-reasoning
  - AlgorithmicResearchGroup/math_reasoning_autoformalization_track
  - KingNish/reasoning-base-20k
  - fmars/wiki_stem
  - ChuGyouk/WebInstructSub-only-sciencestackexchange
  - bigcode/the-stack-smol-xs
  - cognitivecomputations/dolphin-coder
  - HuggingFaceH4/CodeAlpaca_20K
  - m-a-p/CodeFeedback-Filtered-Instruction
  - NuclearAi/Nuke-X-Glaive-Python-Dataset
  - iamtarun/python_code_instructions_18k_alpaca
  - kloodia/html_200k
  - kloodia/json_200k
  - kloodia/javascript_200k
  - bleugreen/typescript-chunks
  - SkunkworksAI/reasoning-0.01
  - Magpie-Align/Magpie-Reasoning-150K
tags:
  - litgpt
  - litdata
---

# tangled-llama-q-32k-base-v0.1


A pretrained language model based on the Llama architecture, with about 65M parameters. It was trained on 16.7B (16,698,858,240) tokens from more than 3.6M (3,597,088) dataset rows.

This model is not intended for immediate use; it is meant as a base for continued pretraining and finetuning on downstream tasks. While it can handle a context length of up to 128K (131,072) tokens, it was pretrained on sequences of 2K (2,048) tokens.
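
For a quick smoke test of the raw base model, the pretraining checkpoint can be sampled with litgpt's generation command. This is a minimal sketch: the prompt is a placeholder, and the exact flag set should be checked against the installed litgpt version.

```bash
# Sketch: sample a raw continuation from the pretraining checkpoint.
# Base model only, so expect plain-text continuation rather than instruction following.
litgpt generate out/pretrain/final/ \
  --prompt "The Eiffel Tower is located in" \
  --max_new_tokens 64
```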

The objective is to streamline the cognitive or reasoning core, eliminating any redundant knowledge from the model.
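
Since the checkpoint is meant as a starting point for continued pretraining and finetuning, a downstream run would typically point a litgpt finetuning command at the pretraining output directory. The sketch below is illustrative only: the subcommand name and flags differ between litgpt versions (older releases use `litgpt finetune lora`, newer ones `litgpt finetune_lora`), and the data module and output directory are assumptions, not taken from this card.

```bash
# Sketch only: verify the subcommand and flags against your installed litgpt version.
# `Alpaca2k` is one of litgpt's built-in data modules; substitute your own dataset.
litgpt finetune_lora out/pretrain/final/ \
  --data Alpaca2k \
  --out_dir out/finetune/lora/
```

Because pretraining used 2K-token sequences, an initial finetuning run would normally keep the maximum sequence length in that range, even though the architecture supports up to 128K tokens.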

*(Training plots: loss / val_loss, val_ppl, epoch, and learning_rate.)*

## lm-evaluation-harness

```bash
litgpt evaluate --tasks 'hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge' --out_dir 'evaluate-quick/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---:|---|---:|---|---:|---|
| arc_challenge | 1 | none | 0 | acc | 0.1962 | ± 0.0116 |
| | | none | 0 | acc_norm | 0.2304 | ± 0.0123 |
| gsm8k | 3 | flexible-extract | 5 | exact_match | 0.0144 | ± 0.0033 |
| | | strict-match | 5 | exact_match | 0.0015 | ± 0.0011 |
| hellaswag | 1 | none | 0 | acc | 0.2631 | ± 0.0044 |
| | | none | 0 | acc_norm | 0.2758 | ± 0.0045 |
| mmlu | 2 | none | | acc | 0.2473 | ± 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± 0.0062 |
| - formal_logic | 1 | none | 0 | acc | 0.2857 | ± 0.0404 |
| - high_school_european_history | 1 | none | 0 | acc | 0.2667 | ± 0.0345 |
| - high_school_us_history | 1 | none | 0 | acc | 0.2696 | ± 0.0311 |
| - high_school_world_history | 1 | none | 0 | acc | 0.2110 | ± 0.0266 |
| - international_law | 1 | none | 0 | acc | 0.1653 | ± 0.0339 |
| - jurisprudence | 1 | none | 0 | acc | 0.2870 | ± 0.0437 |
| - logical_fallacies | 1 | none | 0 | acc | 0.2331 | ± 0.0332 |
| - moral_disputes | 1 | none | 0 | acc | 0.2283 | ± 0.0226 |
| - moral_scenarios | 1 | none | 0 | acc | 0.2425 | ± 0.0143 |
| - philosophy | 1 | none | 0 | acc | 0.2186 | ± 0.0235 |
| - prehistory | 1 | none | 0 | acc | 0.2099 | ± 0.0227 |
| - professional_law | 1 | none | 0 | acc | 0.2314 | ± 0.0108 |
| - world_religions | 1 | none | 0 | acc | 0.2632 | ± 0.0338 |
| - other | 2 | none | | acc | 0.2485 | ± 0.0078 |
| - business_ethics | 1 | none | 0 | acc | 0.2600 | ± 0.0441 |
| - clinical_knowledge | 1 | none | 0 | acc | 0.2528 | ± 0.0267 |
| - college_medicine | 1 | none | 0 | acc | 0.2254 | ± 0.0319 |
| - global_facts | 1 | none | 0 | acc | 0.2700 | ± 0.0446 |
| - human_aging | 1 | none | 0 | acc | 0.2377 | ± 0.0286 |
| - management | 1 | none | 0 | acc | 0.2816 | ± 0.0445 |
| - marketing | 1 | none | 0 | acc | 0.2692 | ± 0.0291 |
| - medical_genetics | 1 | none | 0 | acc | 0.2600 | ± 0.0441 |
| - miscellaneous | 1 | none | 0 | acc | 0.2350 | ± 0.0152 |
| - nutrition | 1 | none | 0 | acc | 0.2549 | ± 0.0250 |
| - professional_accounting | 1 | none | 0 | acc | 0.2801 | ± 0.0268 |
| - professional_medicine | 1 | none | 0 | acc | 0.2610 | ± 0.0267 |
| - virology | 1 | none | 0 | acc | 0.1807 | ± 0.0300 |
| - social sciences | 2 | none | | acc | 0.2658 | ± 0.0080 |
| - econometrics | 1 | none | 0 | acc | 0.1930 | ± 0.0371 |
| - high_school_geography | 1 | none | 0 | acc | 0.2172 | ± 0.0294 |
| - high_school_government_and_politics | 1 | none | 0 | acc | 0.3212 | ± 0.0337 |
| - high_school_macroeconomics | 1 | none | 0 | acc | 0.2923 | ± 0.0231 |
| - high_school_microeconomics | 1 | none | 0 | acc | 0.3025 | ± 0.0298 |
| - high_school_psychology | 1 | none | 0 | acc | 0.2752 | ± 0.0191 |
| - human_sexuality | 1 | none | 0 | acc | 0.2290 | ± 0.0369 |
| - professional_psychology | 1 | none | 0 | acc | 0.2386 | ± 0.0172 |
| - public_relations | 1 | none | 0 | acc | 0.2636 | ± 0.0422 |
| - security_studies | 1 | none | 0 | acc | 0.3143 | ± 0.0297 |
| - sociology | 1 | none | 0 | acc | 0.2338 | ± 0.0299 |
| - us_foreign_policy | 1 | none | 0 | acc | 0.2600 | ± 0.0441 |
| - stem | 2 | none | | acc | 0.2464 | ± 0.0077 |
| - abstract_algebra | 1 | none | 0 | acc | 0.2500 | ± 0.0435 |
| - anatomy | 1 | none | 0 | acc | 0.2148 | ± 0.0355 |
| - astronomy | 1 | none | 0 | acc | 0.1908 | ± 0.0320 |
| - college_biology | 1 | none | 0 | acc | 0.2569 | ± 0.0365 |
| - college_chemistry | 1 | none | 0 | acc | 0.2700 | ± 0.0446 |
| - college_computer_science | 1 | none | 0 | acc | 0.3500 | ± 0.0479 |
| - college_mathematics | 1 | none | 0 | acc | 0.2700 | ± 0.0446 |
| - college_physics | 1 | none | 0 | acc | 0.2745 | ± 0.0444 |
| - computer_security | 1 | none | 0 | acc | 0.3000 | ± 0.0461 |
| - conceptual_physics | 1 | none | 0 | acc | 0.2766 | ± 0.0292 |
| - electrical_engineering | 1 | none | 0 | acc | 0.2345 | ± 0.0353 |
| - elementary_mathematics | 1 | none | 0 | acc | 0.2566 | ± 0.0225 |
| - high_school_biology | 1 | none | 0 | acc | 0.2226 | ± 0.0237 |
| - high_school_chemistry | 1 | none | 0 | acc | 0.2217 | ± 0.0292 |
| - high_school_computer_science | 1 | none | 0 | acc | 0.2000 | ± 0.0402 |
| - high_school_mathematics | 1 | none | 0 | acc | 0.2370 | ± 0.0259 |
| - high_school_physics | 1 | none | 0 | acc | 0.2517 | ± 0.0354 |
| - high_school_statistics | 1 | none | 0 | acc | 0.2685 | ± 0.0302 |
| - machine_learning | 1 | none | 0 | acc | 0.1786 | ± 0.0364 |
| truthfulqa_mc2 | 2 | none | 0 | acc | 0.4668 | ± 0.0161 |
| winogrande | 1 | none | 0 | acc | 0.5012 | ± 0.0141 |

| Groups | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---:|---|---:|---|---:|---|
| mmlu | 2 | none | | acc | 0.2473 | ± 0.0036 |
| - humanities | 2 | none | | acc | 0.2351 | ± 0.0062 |
| - other | 2 | none | | acc | 0.2485 | ± 0.0078 |
| - social sciences | 2 | none | | acc | 0.2658 | ± 0.0080 |
| - stem | 2 | none | | acc | 0.2464 | ± 0.0077 |
```bash
litgpt evaluate --tasks 'leaderboard' --out_dir 'evaluate-leaderboard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---:|---|---:|---|---:|---|
| leaderboard | N/A | | | | | |
| - leaderboard_bbh | N/A | | | | | |
| - leaderboard_bbh_boolean_expressions | 1 | none | 3 | acc_norm ↑ | 0.4600 | ± 0.0316 |
| - leaderboard_bbh_causal_judgement | 1 | none | 3 | acc_norm ↑ | 0.5187 | ± 0.0366 |
| - leaderboard_bbh_date_understanding | 1 | none | 3 | acc_norm ↑ | 0.1840 | ± 0.0246 |
| - leaderboard_bbh_disambiguation_qa | 1 | none | 3 | acc_norm ↑ | 0.3880 | ± 0.0309 |
| - leaderboard_bbh_formal_fallacies | 1 | none | 3 | acc_norm ↑ | 0.4680 | ± 0.0316 |
| - leaderboard_bbh_geometric_shapes | 1 | none | 3 | acc_norm ↑ | 0.1000 | ± 0.0190 |
| - leaderboard_bbh_hyperbaton | 1 | none | 3 | acc_norm ↑ | 0.5160 | ± 0.0317 |
| - leaderboard_bbh_logical_deduction_five_objects | 1 | none | 3 | acc_norm ↑ | 0.2080 | ± 0.0257 |
| - leaderboard_bbh_logical_deduction_seven_objects | 1 | none | 3 | acc_norm ↑ | 0.1720 | ± 0.0239 |
| - leaderboard_bbh_logical_deduction_three_objects | 1 | none | 3 | acc_norm ↑ | 0.3280 | ± 0.0298 |
| - leaderboard_bbh_movie_recommendation | 1 | none | 3 | acc_norm ↑ | 0.2640 | ± 0.0279 |
| - leaderboard_bbh_navigate | 1 | none | 3 | acc_norm ↑ | 0.5760 | ± 0.0313 |
| - leaderboard_bbh_object_counting | 1 | none | 3 | acc_norm ↑ | 0.0520 | ± 0.0141 |
| - leaderboard_bbh_penguins_in_a_table | 1 | none | 3 | acc_norm ↑ | 0.2260 | ± 0.0347 |
| - leaderboard_bbh_reasoning_about_colored_objects | 1 | none | 3 | acc_norm ↑ | 0.0720 | ± 0.0164 |
| - leaderboard_bbh_ruin_names | 1 | none | 3 | acc_norm ↑ | 0.2280 | ± 0.0266 |
| - leaderboard_bbh_salient_translation_error_detection | 1 | none | 3 | acc_norm ↑ | 0.1920 | ± 0.0250 |
| - leaderboard_bbh_snarks | 1 | none | 3 | acc_norm ↑ | 0.4831 | ± 0.0376 |
| - leaderboard_bbh_sports_understanding | 1 | none | 3 | acc_norm ↑ | 0.4600 | ± 0.0316 |
| - leaderboard_bbh_temporal_sequences | 1 | none | 3 | acc_norm ↑ | 0.2360 | ± 0.0269 |
| - leaderboard_bbh_tracking_shuffled_objects_five_objects | 1 | none | 3 | acc_norm ↑ | 0.2080 | ± 0.0257 |
| - leaderboard_bbh_tracking_shuffled_objects_seven_objects | 1 | none | 3 | acc_norm ↑ | 0.1680 | ± 0.0237 |
| - leaderboard_bbh_tracking_shuffled_objects_three_objects | 1 | none | 3 | acc_norm ↑ | 0.3040 | ± 0.0292 |
| - leaderboard_bbh_web_of_lies | 1 | none | 3 | acc_norm ↑ | 0.4880 | ± 0.0317 |
| - leaderboard_gpqa | N/A | | | | | |
| - leaderboard_gpqa_diamond | 1 | none | 0 | acc_norm ↑ | 0.2121 | ± 0.0291 |
| - leaderboard_gpqa_extended | 1 | none | 0 | acc_norm ↑ | 0.2619 | ± 0.0188 |
| - leaderboard_gpqa_main | 1 | none | 0 | acc_norm ↑ | 0.2589 | ± 0.0207 |
| - leaderboard_ifeval | 3 | none | 0 | inst_level_loose_acc ↑ | 0.1966 | N/A |
| | | none | 0 | inst_level_strict_acc ↑ | 0.1835 | N/A |
| | | none | 0 | prompt_level_loose_acc ↑ | 0.1017 | ± 0.0130 |
| | | none | 0 | prompt_level_strict_acc ↑ | 0.0998 | ± 0.0129 |
| - leaderboard_math_hard | N/A | | | | | |
| - leaderboard_math_algebra_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_counting_and_prob_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_geometry_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_intermediate_algebra_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_num_theory_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_prealgebra_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_math_precalculus_hard | 1 | none | 4 | exact_match ↑ | 0.0000 | ± 0 |
| - leaderboard_mmlu_pro | 0.1 | none | 5 | acc ↑ | 0.1155 | ± 0.0029 |
| - leaderboard_musr | N/A | | | | | |
| - leaderboard_musr_murder_mysteries | 1 | none | 0 | acc_norm ↑ | 0.5040 | ± 0.0317 |
| - leaderboard_musr_object_placements | 1 | none | 0 | acc_norm ↑ | 0.3086 | ± 0.0289 |
| - leaderboard_musr_team_allocation | 1 | none | 0 | acc_norm ↑ | 0.3400 | ± 0.0300 |

```bash
litgpt evaluate --tasks 'bbh_zeroshot,bbh_fewshot,bbh_cot_fewshot,bbh_cot_zeroshot' --out_dir 'evaluate-bigbenchhard/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/

litgpt evaluate --tasks 'mmlu,mmlu_pro' --out_dir 'evaluate-mmlu/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/

litgpt evaluate --tasks 'arc_challenge,boolq,gpqa,hellaswag,openbookqa,piqa,truthfulqa_mc2,winogrande' --out_dir 'evaluate-reasoning/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/

litgpt evaluate --tasks 'mmlu_multilingual,mgsm' --out_dir 'evaluate-multilinguals/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/

litgpt evaluate --tasks 'gsm8k,mathqa' --out_dir 'evaluate-math/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/

litgpt evaluate --tasks 'wikitext,qasper' --out_dir 'evaluate-long/' --batch_size 4 --dtype 'bfloat16' out/pretrain/final/
```
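
`litgpt evaluate` wraps lm-evaluation-harness, so the same task suites can also be run with the harness CLI directly. This is a minimal sketch, assuming lm-eval v0.4+ and that the checkpoint has first been converted to a Hugging Face-compatible directory; the path below is a placeholder.

```bash
# Sketch: direct lm-evaluation-harness invocation equivalent to the quick run above.
# Assumes an HF-compatible checkpoint directory; adjust the path for your setup.
lm_eval --model hf \
  --model_args pretrained=out/converted/,dtype=bfloat16 \
  --tasks hellaswag,gsm8k,truthfulqa_mc2,mmlu,winogrande,arc_challenge \
  --batch_size 4 \
  --output_path evaluate-quick/
```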