---
language: en
tags:
  - mistral
  - gguf
  - mistral-trismegistus-7b
license: apache-2.0
datasets:
  - pharaouk/dharma-1/dharma_1_mini.json
metrics:
  - adam_beta1=0.9
  - adam_beta2=0.999
  - adam_epsilon=0.00000001
  - add_cross_attention=false
  - loss=1.4308836460113523
  - runtime=11.3829
  - samples_per_second=8.522
  - steps_per_second=2.196
---

# Model Card for mistral-trismegistus-7b-gguf

This model repository holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
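
A minimal sketch of running one of the GGUF files from this repo on CPU with llama-cpp-python; the quant filename below is an assumption, so substitute whichever .gguf file you download from here:

```python
# Sketch: load a GGUF quant of Mistral-Trismegistus-7B on CPU with llama-cpp-python.
# pip install llama-cpp-python
from llama_cpp import Llama

# Assumed filename; replace with the actual .gguf file from this repo.
llm = Llama(
    model_path="mistral-trismegistus-7b.Q4_K_M.gguf",
    n_ctx=4096,   # matches the model's 4096 max_seq_length / sliding window
    n_threads=8,  # CPU threads; tune for your machine
)

output = llm("USER: What is the Emerald Tablet?\nASSISTANT:", max_tokens=256, stop=["USER:"])
print(output["choices"][0]["text"])
```
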
## Model Details

Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual.

### Model Description

- The First Powerful Occult Expert Model: ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual.
- Fast: trained on Mistral, a state-of-the-art 7B-parameter model, so you can run this model FAST, even on a CPU.
- Not a positivity-nazi: this model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models, which chose positivity over creativity.

### Model Sources

All credit goes to the original model, [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
## Usage

The model expects a plain USER/ASSISTANT prompt format, optionally preceded by a system message:

```
USER: <prompt>
ASSISTANT:
```

OR

```
<system message>
USER: <prompt>
ASSISTANT:
```
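
As a hedged illustration (the `build_prompt` helper and the .gguf filename are inventions for this example, not part of the original repo), the format above can be assembled and run with llama-cpp-python:

```python
# Illustrative sketch: build the USER/ASSISTANT prompt format shown above
# and run it through a local GGUF quant.
from typing import Optional

from llama_cpp import Llama


def build_prompt(user_message: str, system_message: Optional[str] = None) -> str:
    """Assemble the documented prompt format, with an optional system message."""
    prefix = f"{system_message}\n" if system_message else ""
    return f"{prefix}USER: {user_message}\nASSISTANT:"


# Assumed filename; use whichever .gguf file you downloaded from this repo.
llm = Llama(model_path="mistral-trismegistus-7b.Q4_K_M.gguf", n_ctx=4096)

prompt = build_prompt(
    "Explain the principle of correspondence in Hermeticism.",
    system_message="You are a knowledgeable guide to esoteric traditions.",
)
result = llm(prompt, max_tokens=512, stop=["USER:"])
print(result["choices"][0]["text"].strip())
```
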
## Training Details

#### Training Hyperparameters

The values below come from the fine-tuning run configuration of the original model:

"_name_or_path": { |
|
"desc": null, |
|
"value": "mistralai/Mistral-7B-v0.1" |
|
}, |
|
"architectures": { |
|
"desc": null, |
|
"value": [ |
|
"MistralForCausalLM" |
|
] |
|
}, |
|
"bad_words_ids": { |
|
"desc": null, |
|
"value": null |
|
}, |
|
"bench_dataset": { |
|
"desc": null, |
|
"value": "pharaouk/dharma-1/dharma_1_mini.json" |
|
}, |
|
"learning_rate": { |
|
"desc": null, |
|
"value": 0.0004 |
|
}, |
|
"max_grad_norm": { |
|
"desc": null, |
|
"value": 1 |
|
}, |
|
"fp16_opt_level": { |
|
"desc": null, |
|
"value": "O1" |
|
}, |
|
"length_penalty": { |
|
"desc": null, |
|
"value": 1 |
|
}, |
|
"max_seq_length": { |
|
"desc": null, |
|
"value": 4096 |
|
}, |
|
"sliding_window": { |
|
"desc": null, |
|
"value": 4096 |
|
}, |
|
"num_beam_groups": { |
|
"desc": null, |
|
"value": 1 |
|
}, |
|
"initializer_range": { |
|
"desc": null, |
|
"value": 0.02 |
|
}, |
|
"intermediate_size": { |
|
"desc": null, |
|
"value": 14336 |
|
}, |
|
"lr_scheduler_type": { |
|
"desc": null, |
|
"value": "cosine" |
|
}, |
|
"num_hidden_layers": { |
|
"desc": null, |
|
"value": 32 |
|
}, |
|
"repetition_penalty": { |
|
"desc": null, |
|
"value": 1 |
|
}, |
|
"evaluation_strategy": { |
|
"desc": null, |
|
"value": "steps" |
|
}, |
|
"num_attention_heads": { |
|
"desc": null, |
|
"value": 32 |
|
}, |
|
"num_key_value_heads": { |
|
"desc": null, |
|
"value": 8 |
|
}, |
|
"quantization_config": { |
|
"desc": null, |
|
"value": { |
|
"load_in_4bit": true, |
|
"load_in_8bit": false, |
|
"quant_method": "QuantizationMethod.BITS_AND_BYTES", |
|
"llm_int8_threshold": 6, |
|
"bnb_4bit_quant_type": "nf4", |
|
"llm_int8_skip_modules": null, |
|
"bnb_4bit_compute_dtype": "bfloat16", |
|
"llm_int8_has_fp16_weight": false, |
|
"bnb_4bit_use_double_quant": true, |
|
"llm_int8_enable_fp32_cpu_offload": false |
|
} |
|
} |
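
For reference, the `quantization_config` above corresponds to a 4-bit NF4 bitsandbytes setup. A minimal sketch of expressing it as a `transformers` `BitsAndBytesConfig` (illustrative only, not the original training script):

```python
# Sketch: the quantization_config above expressed as a transformers BitsAndBytesConfig.
# pip install transformers bitsandbytes accelerate
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_threshold=6.0,
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",  # base model named in _name_or_path above
    quantization_config=bnb_config,
    device_map="auto",
)
```
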
#### Speeds, Sizes, Times

Final metrics logged for the fine-tuning run:

```json
{
  "_step": 9589,
  "_wandb.runtime": 12960,
  "_runtime": 12960.192620515823,
  "eval/loss": 1.4308836460113523,
  "train/train_steps_per_second": 0.739,
  "train/train_samples_per_second": 2.956,
  "train/loss": 0.3396,
  "train/epoch": 4,
  "train/total_flos": 1757020072120942600,
  "train/train_loss": 0.8929485179171377,
  "train/learning_rate": 0,
  "eval/steps_per_second": 2.196,
  "_timestamp": 1696542775.2713604,
  "eval/runtime": 11.3829,
  "train/global_step": 9584,
  "train/train_runtime": 12962.7813,
  "eval/samples_per_second": 8.522
}
```
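
As a quick sanity check (plain arithmetic, not taken from the original logs), the throughput figures above are internally consistent:

```python
# Consistency check of the logged throughput numbers above.
train_runtime = 12962.7813  # seconds (train/train_runtime)
global_step = 9584          # train/global_step
print(global_step / train_runtime)  # ~0.739 -> matches train/train_steps_per_second

eval_runtime = 11.3829      # seconds (eval/runtime)
eval_samples_per_second = 8.522
print(eval_runtime * eval_samples_per_second)  # ~97 evaluation samples per pass
```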