import json
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal, Optional, Type, Union

import torch
from typing_extensions import Self

from tsai_gpt.utils import find_multiple


@dataclass
class Config:
    name: str = ""
    hf_config: dict = field(default_factory=dict)
    block_size: int = 4096
    vocab_size: int = 50254
    padding_multiple: int = 512
    padded_vocab_size: Optional[int] = None
    n_layer: int = 16
    n_head: int = 32
    n_embd: int = 4096
    rotary_percentage: float = 0.25
    parallel_residual: bool = True
    bias: bool = True
    lm_head_bias: bool = False
    # to use multi-head attention (MHA), set this to `n_head` (default)
    # to use multi-query attention (MQA), set this to 1
    # to use grouped-query attention (GQA), set this to a value in between
    # credit https://arxiv.org/pdf/2305.13245.pdf
    n_query_groups: Optional[int] = None
    shared_attention_norm: bool = False
    _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
    norm_eps: float = 1e-5
    _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
    gelu_approximate: str = "none"
    intermediate_size: Optional[int] = None
    rope_condense_ratio: int = 1
    rope_base: int = 10000

    def __post_init__(self):
        if not self.name:
            self.name = self.hf_config.get("name", self.name)

        assert self.n_embd % self.n_head == 0
        self.head_size = self.n_embd // self.n_head

        # pad the vocab size to the next multiple for hardware efficiency
        if self.padded_vocab_size is None:
            self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
        else:
            # vocab size shouldn't be larger than padded vocab size
            self.vocab_size = min(self.vocab_size, self.padded_vocab_size)

        # compute the number of query groups (defaults to MHA)
        if self.n_query_groups is not None:
            assert self.n_head % self.n_query_groups == 0
        else:
            self.n_query_groups = self.n_head

        # compute the intermediate size for the MLP if not set
        if self.intermediate_size is None:
            if self._mlp_class == "LLaMAMLP":
                raise ValueError("The config needs to set the `intermediate_size`")
            self.intermediate_size = 4 * self.n_embd

        self.rope_n_elem = int(self.rotary_percentage * self.head_size)

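    # Worked example under the defaults above (illustrative numbers, not extra
    # configuration): head_size = 4096 // 32 = 128,
    # padded_vocab_size = find_multiple(50254, 512) = 50688, and
    # rope_n_elem = int(0.25 * 128) = 32, i.e. RoPE rotates the first 32
    # dimensions of each attention head.
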
    @classmethod
    def from_name(cls, name: str, **kwargs: Any) -> Self:
        if name not in name_to_config:
            # fall back to searching through all `config['hf_config']['name']`
            try:
                conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
            except StopIteration:
                raise ValueError(f"{name!r} is not a supported config name")
        else:
            conf_dict = name_to_config[name]

        conf_dict = conf_dict.copy()
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        conf_dict.update(kwargs)
        return cls(**conf_dict)

    @classmethod
    def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
        with open(path, encoding="utf-8") as fp:
            json_kwargs = json.load(fp)
        if "condense_ratio" in json_kwargs:  # legacy name
            json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        if "org" in json_kwargs:  # legacy name
            json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
        if "org" in kwargs:  # legacy name
            kwargs["hf_config"] = {
                "name": kwargs.get("name", json_kwargs["name"]),
                "org": kwargs.pop("org"),
            }
        json_kwargs.update(kwargs)
        return cls(**json_kwargs)

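    # A minimal JSON accepted by `from_json` (the values are illustrative, not
    # a shipped file):
    #
    #   {"name": "pythia-70m", "org": "EleutherAI", "block_size": 2048}
    #
    # The legacy top-level "org" key is folded into `hf_config`, and a legacy
    # "condense_ratio" key is renamed to `rope_condense_ratio`.
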
    @property
    def mlp_class(self) -> Type:
        # `self._mlp_class` is stored as a string to keep the config json serializable
        import tsai_gpt.model

        return getattr(tsai_gpt.model, self._mlp_class)

    @property
    def norm_class(self) -> Type:
        # `self._norm_class` is stored as a string to keep the config json serializable
        if self._norm_class == "RMSNorm":
            from tsai_gpt.rmsnorm import RMSNorm

            return RMSNorm
        return getattr(torch.nn, self._norm_class)


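# A short usage sketch (the names below come from this module; the override
# values are only examples):
#
#   config = Config.from_name("pythia-70m", block_size=1024)
#   mlp_cls = config.mlp_class     # resolves to tsai_gpt.model.GptNeoxMLP
#   norm_cls = config.norm_class   # torch.nn.LayerNorm for this config
#   print(config.head_size, config.rope_n_elem, config.padded_vocab_size)

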
########################
# Stability AI StableLM
########################
configs = [
    dict(
        name="stablelm-base-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b"),
    ),
    dict(
        name="stablelm-base-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
    dict(
        name="stablelm-tuned-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"),
        n_head=32,
    ),
    dict(
        name="stablelm-tuned-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
]


####################
# EleutherAI Pythia
####################
pythia = [
    dict(
        name="pythia-70m",
        hf_config=dict(org="EleutherAI", name="pythia-70m"),
        block_size=2048,
        n_layer=6,
        n_embd=512,
        n_head=8,
        padding_multiple=128,
    ),
    dict(
        name="pythia-160m",
        hf_config=dict(org="EleutherAI", name="pythia-160m"),
        block_size=2048,
        n_layer=12,
        n_embd=768,
        n_head=12,
        padding_multiple=128,
    ),
    dict(
        name="pythia-410m",
        hf_config=dict(org="EleutherAI", name="pythia-410m"),
        block_size=2048,
        n_layer=24,
        n_embd=1024,
        n_head=16,
        padding_multiple=128,
    ),
    dict(
        name="pythia-1b",
        hf_config=dict(org="EleutherAI", name="pythia-1b"),
        block_size=2048,
        n_embd=2048,
        n_head=8,
        padding_multiple=128,
    ),
    dict(
        name="pythia-1.4b",
        hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
        block_size=2048,
        n_layer=24,
        n_embd=2048,
        n_head=16,
        padding_multiple=128,
    ),
    dict(
        name="pythia-2.8b",
        hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=128,
    ),
    dict(
        name="pythia-6.9b",
        hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
    ),
    dict(
        name="pythia-12b",
        hf_config=dict(org="EleutherAI", name="pythia-12b"),
        block_size=2048,
        n_layer=36,
        n_embd=5120,
        n_head=40,
    ),
]
configs.extend(pythia)
for c in pythia:
    # a shallow `dict.copy` would share the nested `hf_config` dict and
    # mutate the original entry, so a deep copy is required here
    copy = deepcopy(c)
    copy["name"] = f"{c['name']}-deduped"
    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
    configs.append(copy)

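# After this loop, `name_to_config` (built at the bottom of this module) also
# resolves the deduplicated variants, e.g.:
#
#   Config.from_name("pythia-70m-deduped")

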
####################################
# togethercomputer RedPajama INCITE
####################################
redpajama_incite = [
    dict(
        name="RedPajama-INCITE-{}-3B-v1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    dict(
        name="RedPajama-INCITE-7B-{}",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    dict(
        name="RedPajama-INCITE-{}-7B-v0.1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
]
for c in redpajama_incite:
    for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)  # deep copy so the nested `hf_config` is not shared
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


#################
# TII UAE Falcon
#################
falcon = [
    dict(
        name="falcon-7b{}",
        hf_config=dict(org="tiiuae", name="falcon-7b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=32,
        n_head=71,
        n_embd=4544,
        rotary_percentage=1.0,
        n_query_groups=1,
        bias=False,
        # not in the HF config, but in the original model implementation, only for this size
        shared_attention_norm=True,
    ),
    dict(
        name="falcon-40b{}",
        hf_config=dict(org="tiiuae", name="falcon-40b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=60,
        n_head=128,
        n_embd=8192,
        rotary_percentage=1.0,
        n_query_groups=8,
        bias=False,
    ),
]
for c in falcon:
    for kind in ("", "-instruct"):
        copy = deepcopy(c)  # deep copy so the nested `hf_config` is not shared
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

falcon180b = dict(
    name="falcon-180B{}",
    hf_config=dict(org="tiiuae", name="falcon-180B{}"),
    block_size=2048,
    vocab_size=65024,
    padded_vocab_size=65024,
    n_layer=80,
    n_head=232,
    n_embd=14848,
    rotary_percentage=1.0,
    n_query_groups=8,
    bias=False,
)

for kind in ("", "-chat"):
    copy = deepcopy(falcon180b)  # deep copy so the nested `hf_config` is not shared
    copy["name"] = falcon180b["name"].format(kind)
    copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
    configs.append(copy)


#############################
# OpenLM Research Open LLaMA
#############################
open_LLaMA = [
    dict(
        name="open_llama_3b",
        hf_config=dict(org="openlm-research", name="open_llama_3b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=26,
        n_embd=3200,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=8640,
    ),
    dict(
        name="open_llama_7b",
        hf_config=dict(org="openlm-research", name="open_llama_7b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="open_llama_13b",
        hf_config=dict(org="openlm-research", name="open_llama_13b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(open_LLaMA)


###############
# LMSYS Vicuna
###############
vicuna = [
    dict(
        name="vicuna-7b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="vicuna-13b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="vicuna-33b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    dict(
        name="vicuna-7b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="vicuna-7b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=4,
    ),
    dict(
        name="vicuna-13b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="vicuna-13b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=4,
    ),
]
configs.extend(vicuna)


#################
# LMSYS LongChat
#################
long_chat = [
    dict(
        name="longchat-7b-16k",
        hf_config=dict(org="lmsys", name="longchat-7b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    ),
    dict(
        name="longchat-13b-16k",
        hf_config=dict(org="lmsys", name="longchat-13b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=8,
    ),
]
configs.extend(long_chat)


######################
# NousResearch Hermes
######################
nous_research = [
    dict(
        name="Nous-Hermes-llama-2-7b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Nous-Hermes-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32001,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Nous-Hermes-Llama2-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
        vocab_size=32000,
        padded_vocab_size=32032,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(nous_research)


###############
# Meta LLaMA 2
###############
llama_2 = [
    dict(
        name="Llama-2-7b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Llama-2-13b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Llama-2-70b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
for c in llama_2:
    for kind in ("", "-chat"):
        copy = deepcopy(c)  # deep copy so the nested `hf_config` is not shared
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


##########################
# Stability AI FreeWilly2
##########################
freewilly_2 = [
    dict(
        name="FreeWilly2",
        hf_config=dict(org="stabilityai", name="FreeWilly2"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    )
]
configs.extend(freewilly_2)


##################
# Meta Code Llama
##################
code_llama = [
    dict(
        name="CodeLlama-7b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-7b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-7b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
        block_size=2048,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
]
configs.extend(code_llama)


########################
# garage-bAInd Platypus
########################
platypus = [
    dict(
        name="Platypus-30B",
        hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
        block_size=2048,
        padded_vocab_size=32000,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-06,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    dict(
        name="Platypus2-7B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    dict(
        name="Camel-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Camel-Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    dict(
        name="Stable-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Platypus2-70B-instruct",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
configs.extend(platypus)


##########################
# Stability AI StableCode
##########################
stablecode = [
    dict(
        name="stablecode-completion-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
        block_size=16384,
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    dict(
        name="stablecode-completion-alpha-3b-4k",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    dict(
        name="stablecode-instruct-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
]
configs.extend(stablecode)


##################################
# togethercomputer LLaMA-2-7B-32K
##################################
together_llama2_32k = [
    dict(
        name="LLaMA-2-7B-32K",
        hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    )
]
configs.extend(together_llama2_32k)


################
# Microsoft Phi
################
phi = [
    dict(
        name="phi-1_5",
        hf_config=dict(org="microsoft", name="phi-1_5"),
        vocab_size=50257,
        padded_vocab_size=51200,
        block_size=2048,
        n_embd=2048,
        n_layer=24,
        rotary_percentage=0.5,
        shared_attention_norm=True,
        lm_head_bias=True,
        gelu_approximate="tanh",
    )
]
configs.extend(phi)


#############
# Mistral AI
#############
mistral = [
    dict(
        name="Mistral-7B-{}v0.1",
        hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
        padded_vocab_size=32000,
        block_size=4096,  # the model reports 32768, but sliding-window attention is not implemented here
        n_layer=32,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=14336,
    )
]
for c in mistral:
    for kind in ("", "Instruct-"):
        copy = deepcopy(c)  # deep copy so the nested `hf_config` is not shared
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


############
# TinyLlama
############
tiny_llama = [
    dict(
        name="tiny-llama-1.1b",
        hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=22,
        n_head=32,
        n_embd=2048,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
    dict(
        name="tiny-llama-new",
        hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
        block_size=768,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=18,
        n_head=32,
        n_embd=1024,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
]
configs.extend(tiny_llama)


name_to_config = {config["name"]: config for config in configs}


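if __name__ == "__main__":
    # Illustrative smoke test (an assumed usage sketch, not part of the
    # registry itself): resolve a few representative configs by name and
    # print their derived attributes.
    for example_name in ("pythia-70m", "falcon-7b", "Llama-2-7b-hf"):
        cfg = Config.from_name(example_name)
        print(example_name, cfg.n_layer, cfg.head_size, cfg.padded_vocab_size)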