jtatman committed on
Commit
d1e5b17
1 Parent(s): 99e09d9

initial push

Browse files
Files changed (1) hide show
  1. README.md +153 -0
README.md ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card for Model mistral-trimegistus-7b-gguf
2
+
3
+ This model repo holds GGUF-quantized versions of [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
4
+
5
+ ## Model Details
6
+
7
+ Transcendence is All You Need! Mistral Trismegistus is a model made for people interested in the esoteric, occult, and spiritual.
8
+
9
+ ### Model Description
10
+
11
+
12
+ - The First Powerful Occult Expert Model: ~10,000 high-quality, deep, rich instructions on the occult, esoteric, and spiritual.
13
+
14
+ - Fast: Because it is trained on Mistral, a state-of-the-art 7B-parameter model, you can run this model FAST, even on a CPU.
15
+ - Not a positivity-nazi: This model was trained on all forms of esoteric tasks and knowledge, and is not burdened by the flowery nature of many other models, which choose positivity over creativity.
16
+
17
+
18
+ ### Model Sources [optional]
19
+
20
+ All credit goes to the original model, [teknium/Mistral-Trismegistus-7B](https://huggingface.co/teknium/Mistral-Trismegistus-7B).
21
+
22
+ ## Usage
23
+
24
+ USER: `<prompt>`
25
+ ASSISTANT:
26
+
27
+ OR
28
+
29
+ `<system message>`
30
+ USER: `<prompt>`
31
+ ASSISTANT:
32
+
33
+
34
+ ## Training Details
35
+
36
+ #### Training Hyperparameters
37
+
38
+ "_name_or_path": {
39
+ "desc": null,
40
+ "value": "mistralai/Mistral-7B-v0.1"
41
+ },
42
+ "architectures": {
43
+ "desc": null,
44
+ "value": [
45
+ "MistralForCausalLM"
46
+ ]
47
+ },
48
+ "bad_words_ids": {
49
+ "desc": null,
50
+ "value": null
51
+ },
52
+ "bench_dataset": {
53
+ "desc": null,
54
+ "value": "pharaouk/dharma-1/dharma_1_mini.json"
55
+ },
56
+ "learning_rate": {
57
+ "desc": null,
58
+ "value": 0.0004
59
+ },
60
+ "max_grad_norm": {
61
+ "desc": null,
62
+ "value": 1
63
+ },
64
+ "fp16_opt_level": {
65
+ "desc": null,
66
+ "value": "O1"
67
+ },
68
+ "length_penalty": {
69
+ "desc": null,
70
+ "value": 1
71
+ },
72
+ "max_seq_length": {
73
+ "desc": null,
74
+ "value": 4096
75
+ },
76
+ "sliding_window": {
77
+ "desc": null,
78
+ "value": 4096
79
+ },
80
+ "num_beam_groups": {
81
+ "desc": null,
82
+ "value": 1
83
+ },
84
+ "initializer_range": {
85
+ "desc": null,
86
+ "value": 0.02
87
+ },
88
+ "intermediate_size": {
89
+ "desc": null,
90
+ "value": 14336
91
+ },
92
+ "lr_scheduler_type": {
93
+ "desc": null,
94
+ "value": "cosine"
95
+ },
96
+ "num_hidden_layers": {
97
+ "desc": null,
98
+ "value": 32
99
+ },
100
+ "repetition_penalty": {
101
+ "desc": null,
102
+ "value": 1
103
+ },
104
+ "evaluation_strategy": {
105
+ "desc": null,
106
+ "value": "steps"
107
+ },
108
+ "num_attention_heads": {
109
+ "desc": null,
110
+ "value": 32
111
+ },
112
+ "num_key_value_heads": {
113
+ "desc": null,
114
+ "value": 8
115
+ },
116
+ "quantization_config": {
117
+ "desc": null,
118
+ "value": {
119
+ "load_in_4bit": true,
120
+ "load_in_8bit": false,
121
+ "quant_method": "QuantizationMethod.BITS_AND_BYTES",
122
+ "llm_int8_threshold": 6,
123
+ "bnb_4bit_quant_type": "nf4",
124
+ "llm_int8_skip_modules": null,
125
+ "bnb_4bit_compute_dtype": "bfloat16",
126
+ "llm_int8_has_fp16_weight": false,
127
+ "bnb_4bit_use_double_quant": true,
128
+ "llm_int8_enable_fp32_cpu_offload": false
129
+ }
130
+ }
131
+
132
+
133
+ #### Speeds, Sizes, Times
134
+
135
+ {
136
+ "_step": 9589,
137
+ "_wandb.runtime": 12960,
138
+ "_runtime": 12960.192620515823,
139
+ "eval/loss": 1.4308836460113523,
140
+ "train/train_steps_per_second": 0.739,
141
+ "train/train_samples_per_second": 2.956,
142
+ "train/loss": 0.3396,
143
+ "train/epoch": 4,
144
+ "train/total_flos": 1757020072120942600,
145
+ "train/train_loss": 0.8929485179171377,
146
+ "train/learning_rate": 0,
147
+ "eval/steps_per_second": 2.196,
148
+ "_timestamp": 1696542775.2713604,
149
+ "eval/runtime": 11.3829,
150
+ "train/global_step": 9584,
151
+ "train/train_runtime": 12962.7813,
152
+ "eval/samples_per_second": 8.522
153
+ }