add model

Files changed (5) hide show

README.md CHANGED Viewed

@@ -1,5 +1,18 @@
-TODO: Complete
-This architecture takes GQA and tied embeddings to create an effeceint 5B model
-This uses a mix of data yet to be published

+### Micro Mistral
+This is a small mistral model with 6 layers
+It is similar to the smol llama variants in that it uses GQA and tied embeddings.
+Unlike them, it uses a Mistral-style architecture with GQA and sliding window attention.
+This architecture combines GQA and tied embeddings to create an efficient 0.5B model that uses the Mistral architecture (which is supported in downstream applications).
+#### Dataset
+- Minipile
+- Instruct
+- Math
+- OpenOrca
+- Synthetic Data
+TODO: Complete Dataset section

config.json CHANGED Viewed

@@ -25,6 +25,5 @@
   "torch_dtype": "bfloat16",
   "transformers_version": "4.34.1",
   "use_cache": true,
-  "vocab_size": 50304,
-}

   "torch_dtype": "bfloat16",
   "transformers_version": "4.34.1",
   "use_cache": true,
+  "vocab_size": 50304
+}

init_model.py ADDED Viewed

+"""Build a freshly initialized bf16 checkpoint from a local ``config.json``.
+
+The script instantiates the architecture described by the config with random
+weights, re-draws the Linear/Embedding weights from N(0, 0.02), casts the
+model to bfloat16, and writes it to ``./model_bf16.safetensors``.
+"""
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+
+# Load the configuration and build the model with randomly initialized weights.
+config_path = "config.json"  # Adjust path as necessary
+config = AutoConfig.from_pretrained(config_path)
+model = AutoModelForCausalLM.from_config(config)
+
+
+def reinitialize_weights(module):
+    """Re-draw projection and embedding weights from N(0, 0.02).
+
+    Intended to be passed to ``model.apply``, which calls it once per
+    submodule.
+
+    Only ``nn.Linear`` and ``nn.Embedding`` modules are touched. Any other
+    module that owns a ``weight`` (in particular normalization layers, whose
+    scale must stay at its default) is deliberately left alone; matching on
+    ``hasattr(module, "weight")`` would overwrite those scales with values
+    close to zero.
+
+    Args:
+        module: The submodule currently being visited.
+    """
+    if isinstance(module, torch.nn.Linear):
+        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+        if module.bias is not None:
+            torch.nn.init.constant_(module.bias, 0.0)
+    elif isinstance(module, torch.nn.Embedding):
+        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+
+model.apply(reinitialize_weights)
+
+# Cast all floating point parameters and buffers to bfloat16.
+model = model.to(dtype=torch.bfloat16)
+
+# Save the weights in safetensors format. Note that the target path is a
+# directory (config and weight files are written inside it), not a single file.
+model.save_pretrained("./model_bf16.safetensors", safe_serialization=True)

model_bf16.safetensors/config.json ADDED Viewed

+{
+  "_name_or_path": "config.json",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "dropout_p": 0.1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 128,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 16,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "sliding_window": 1024,
+  "tie_word_embeddings": true,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.36.2",
+  "use_cache": true,
+  "vocab_size": 50304
+}

model_bf16.safetensors/generation_config.json ADDED Viewed

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.36.2"
+}