jtatman
/

mistral-trismegistus-7b-gguf

+# Model Card for mistral-trismegistus-7b-gguf
+This model repo holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
+## Model Details
+Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual.
+### Model Description
+- The First Powerful Occult Expert Model: trained on ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual.
+- Fast: Trained on Mistral, a state-of-the-art 7B-parameter model, so you can run this model FAST even on a CPU.
+- Not a positivity-nazi: This model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models that choose positivity over creativity.
+### Model Sources
+All credit goes to the original model: [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
+## Usage
+Prompt format:
+```
+USER: <prompt>
+ASSISTANT:
+```
+Or, with a system message:
+```
+<system message>
+USER: <prompt>
+ASSISTANT:
+```
+## Training Details
+#### Training Hyperparameters
+{
+  "_name_or_path": {
+    "desc": null,
+    "value": "mistralai/Mistral-7B-v0.1"
+  },
+  "architectures": {
+    "desc": null,
+    "value": [
+      "MistralForCausalLM"
+    ]
+  },
+  "bad_words_ids": {
+    "desc": null,
+    "value": null
+  },
+  "bench_dataset": {
+    "desc": null,
+    "value": "pharaouk/dharma-1/dharma_1_mini.json"
+  },
+  "learning_rate": {
+    "desc": null,
+    "value": 0.0004
+  },
+  "max_grad_norm": {
+    "desc": null,
+    "value": 1
+  },
+  "fp16_opt_level": {
+    "desc": null,
+    "value": "O1"
+  },
+  "length_penalty": {
+    "desc": null,
+    "value": 1
+  },
+  "max_seq_length": {
+    "desc": null,
+    "value": 4096
+  },
+  "sliding_window": {
+    "desc": null,
+    "value": 4096
+  },
+  "num_beam_groups": {
+    "desc": null,
+    "value": 1
+  },
+  "initializer_range": {
+    "desc": null,
+    "value": 0.02
+  },
+  "intermediate_size": {
+    "desc": null,
+    "value": 14336
+  },
+  "lr_scheduler_type": {
+    "desc": null,
+    "value": "cosine"
+  },
+  "num_hidden_layers": {
+    "desc": null,
+    "value": 32
+  },
+  "repetition_penalty": {
+    "desc": null,
+    "value": 1
+  },
+  "evaluation_strategy": {
+    "desc": null,
+    "value": "steps"
+  },
+  "num_attention_heads": {
+    "desc": null,
+    "value": 32
+  },
+  "num_key_value_heads": {
+    "desc": null,
+    "value": 8
+  },
+  "quantization_config": {
+    "desc": null,
+    "value": {
+      "load_in_4bit": true,
+      "load_in_8bit": false,
+      "quant_method": "QuantizationMethod.BITS_AND_BYTES",
+      "llm_int8_threshold": 6,
+      "bnb_4bit_quant_type": "nf4",
+      "llm_int8_skip_modules": null,
+      "bnb_4bit_compute_dtype": "bfloat16",
+      "llm_int8_has_fp16_weight": false,
+      "bnb_4bit_use_double_quant": true,
+      "llm_int8_enable_fp32_cpu_offload": false
+    }
+  }
+}
+#### Speeds, Sizes, Times
+{
+  "_step": 9589,
+  "_wandb.runtime": 12960,
+  "_runtime": 12960.192620515823,
+  "eval/loss": 1.4308836460113523,
+  "train/train_steps_per_second": 0.739,
+  "train/train_samples_per_second": 2.956,
+  "train/loss": 0.3396,
+  "train/epoch": 4,
+  "train/total_flos": 1757020072120942600,
+  "train/train_loss": 0.8929485179171377,
+  "train/learning_rate": 0,
+  "eval/steps_per_second": 2.196,
+  "_timestamp": 1696542775.2713604,
+  "eval/runtime": 11.3829,
+  "train/global_step": 9584,
+  "train/train_runtime": 12962.7813,
+  "eval/samples_per_second": 8.522
+}