tangled-llama-t-128k-base-v0.1 / scripts /prepare_contrain_dataset.py
mtasic85's picture
cognition
96b0f63
raw
history blame
1.38 kB
# Reference notes only: this section consists of two bare triple-quoted string
# literals that list Hugging Face dataset URLs under "# <category>" headings.
# No statement in this section reads, downloads, or processes those datasets.
# NOTE(review): the script name suggests these are candidate sources for a
# continued-training ("contrain") data mix -- confirm against the real pipeline.
# NOTE(review): in the first list, the self_cognition URL is the only entry
# prefixed with "#", and the "role-play" / "reflection" headings have no URLs
# under them -- presumably disabled / not yet chosen; TODO confirm.
"""
# cognition
# https://huggingface.co/datasets/Tongjilibo/self_cognition
# instruction
https://huggingface.co/datasets/arcee-ai/The-Tome
https://huggingface.co/datasets/teknium/OpenHermes-2.5
# tool/function calling
https://huggingface.co/datasets/NousResearch/hermes-function-calling-v1
# math
https://huggingface.co/datasets/ai2-adapt-dev/openmath-2-math
# agent
https://huggingface.co/datasets/arcee-ai/agent-data
# role-play
# reflection
# reasoning
https://huggingface.co/datasets/KingNish/reasoning-base-20k
https://huggingface.co/datasets/Magpie-Align/Magpie-Reasoning-150K
https://huggingface.co/datasets/thesven/gsm8k-reasoning
"""
# Second list: dataset URLs under the "sft" and "dpo" headings. This is a
# separate string expression statement, so it is not part of the first literal.
# NOTE(review): presumably set aside for later supervised fine-tuning and
# preference-optimization stages rather than this script -- confirm.
"""
# sft
https://huggingface.co/datasets/HuggingFaceH4/no_robots
https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k
https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft
https://huggingface.co/datasets/Open-Orca/slimorca-deduped-cleaned-corrected
https://huggingface.co/datasets/arcee-ai/EvolKit-20k
https://huggingface.co/datasets/ise-uiuc/Magicoder-Evol-Instruct-110K
https://huggingface.co/datasets/WizardLMTeam/WizardLM_evol_instruct_V2_196k
https://huggingface.co/datasets/ai2-adapt-dev/olmoe-commercial
# dpo
https://huggingface.co/datasets/allenai/ultrafeedback_binarized_cleaned
https://huggingface.co/datasets/kyujinpy/orca_math_dpo
https://huggingface.co/datasets/argilla/OpenHermesPreferences
"""