jtatman's picture
added metadata
d5a99cf
|
raw
history blame
3.74 kB
---
language: en
tags:
- mistral
- gguf
- mistral-trismegistus-7b
license:
- apache-2.0
datasets:
- pharaouk/dharma-1/dharma_1_mini.json
metrics:
- adam_beta1=0.9
- adam_beta2=0.999
- adam_epsilon=0.00000001
- add_cross_attention=false
- loss=1.4308836460113523
- runtime=11.3829
- samples_per_second=8.522
- steps_per_second=2.196
---
# Model Card for mistral-trismegistus-7b-gguf
This model repo holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
## Model Details
Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual.
### Model Description
- The First Powerful Occult Expert Model: ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual.
- Fast: Trained on Mistral, a state-of-the-art 7B-parameter model, so you can run this model FAST, even on a CPU.
- Not a positivity-nazi: This model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models, which chose positivity over creativity.
### Model Sources
All credit goes to the original model, [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
## Usage
Prompt format without a system message:

```
USER: <prompt>
ASSISTANT:
```

OR, with a system message:

```
<system message>
USER: <prompt>
ASSISTANT:
```
## Training Details
#### Training Hyperparameters

```json
{
  "_name_or_path": {
    "desc": null,
    "value": "mistralai/Mistral-7B-v0.1"
  },
  "architectures": {
    "desc": null,
    "value": [
      "MistralForCausalLM"
    ]
  },
  "bad_words_ids": {
    "desc": null,
    "value": null
  },
  "bench_dataset": {
    "desc": null,
    "value": "pharaouk/dharma-1/dharma_1_mini.json"
  },
  "learning_rate": {
    "desc": null,
    "value": 0.0004
  },
  "max_grad_norm": {
    "desc": null,
    "value": 1
  },
  "fp16_opt_level": {
    "desc": null,
    "value": "O1"
  },
  "length_penalty": {
    "desc": null,
    "value": 1
  },
  "max_seq_length": {
    "desc": null,
    "value": 4096
  },
  "sliding_window": {
    "desc": null,
    "value": 4096
  },
  "num_beam_groups": {
    "desc": null,
    "value": 1
  },
  "initializer_range": {
    "desc": null,
    "value": 0.02
  },
  "intermediate_size": {
    "desc": null,
    "value": 14336
  },
  "lr_scheduler_type": {
    "desc": null,
    "value": "cosine"
  },
  "num_hidden_layers": {
    "desc": null,
    "value": 32
  },
  "repetition_penalty": {
    "desc": null,
    "value": 1
  },
  "evaluation_strategy": {
    "desc": null,
    "value": "steps"
  },
  "num_attention_heads": {
    "desc": null,
    "value": 32
  },
  "num_key_value_heads": {
    "desc": null,
    "value": 8
  },
  "quantization_config": {
    "desc": null,
    "value": {
      "load_in_4bit": true,
      "load_in_8bit": false,
      "quant_method": "QuantizationMethod.BITS_AND_BYTES",
      "llm_int8_threshold": 6,
      "bnb_4bit_quant_type": "nf4",
      "llm_int8_skip_modules": null,
      "bnb_4bit_compute_dtype": "bfloat16",
      "llm_int8_has_fp16_weight": false,
      "bnb_4bit_use_double_quant": true,
      "llm_int8_enable_fp32_cpu_offload": false
    }
  }
}
```
#### Speeds, Sizes, Times
```json
{
  "_step": 9589,
  "_wandb.runtime": 12960,
  "_runtime": 12960.192620515823,
  "eval/loss": 1.4308836460113523,
  "train/train_steps_per_second": 0.739,
  "train/train_samples_per_second": 2.956,
  "train/loss": 0.3396,
  "train/epoch": 4,
  "train/total_flos": 1757020072120942600,
  "train/train_loss": 0.8929485179171377,
  "train/learning_rate": 0,
  "eval/steps_per_second": 2.196,
  "_timestamp": 1696542775.2713604,
  "eval/runtime": 11.3829,
  "train/global_step": 9584,
  "train/train_runtime": 12962.7813,
  "eval/samples_per_second": 8.522
}
```