{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import math\n",
    "import multiprocessing\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import subprocess\n",
    "import yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define base model name and default values for parameters\n",
    "path_to_llamacpp = '/Users/macdev/Downloads/build/bin'\n",
    "base_model_name = 'salamandra-2b'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_from_config(config_file):\n",
    "    \"\"\"Extract parameters like context size, rope frequency base, and other sampling settings from a config JSON file.\"\"\"\n",
    "    with open(config_file, 'r') as file:\n",
    "        config_data = json.load(file)\n",
    "\n",
    "    # Extract parameters if present\n",
    "    params = {}\n",
    "    params['ctx_size'] = config_data.get(\"max_position_embeddings\")  # Context size\n",
    "    params['rope_freq_base'] = config_data.get(\"rope_theta\")         # RoPE frequency base\n",
    "    params['rope_scaling'] = config_data.get(\"rope_scaling\")         # RoPE scaling factor\n",
    "    params['rope_scaling_type'] = config_data.get(\"rope_scaling_type\") # RoPE scaling type\n",
    "    params['torch_dtype'] = config_data.get(\"torch_dtype\")           # Torch data type\n",
    "    params['top_p'] = config_data.get(\"sampling.top_p\")              # Top-p sampling\n",
    "    params['temp'] = config_data.get(\"sampling.temperature\")         # Sampling temperature\n",
    "    params['repeat_penalty'] = config_data.get(\"sampling.repeat_penalty\") # Repetition penalty\n",
    "    params['repeat_last_n'] = config_data.get(\"sampling.repeat_last_n\")   # Last N tokens for repetition penalty\n",
    "    params['min_p'] = config_data.get(\"sampling.min_p\")              # Minimum probability sampling\n",
    "    params['top_k'] = config_data.get(\"sampling.top_k\")              # Top-k sampling\n",
    "    params['presence_penalty'] = config_data.get(\"sampling.presence_penalty\") # Presence penalty for repeat tokens\n",
    "    params['frequency_penalty'] = config_data.get(\"sampling.frequency_penalty\") # Frequency penalty for repeat tokens\n",
    "    params['mirostat'] = config_data.get(\"sampling.mirostat\")        # Mirostat sampling\n",
    "    params['mirostat_lr'] = config_data.get(\"sampling.mirostat_lr\")  # Mirostat learning rate\n",
    "    params['mirostat_ent'] = config_data.get(\"sampling.mirostat_ent\") # Mirostat entropy target\n",
    "    params['tfs'] = config_data.get(\"sampling.tfs\")                  # Tail free sampling\n",
    "    params['typical'] = config_data.get(\"sampling.typical\")          # Locally typical sampling\n",
    "\n",
    "    return params\n"
   ]
  },
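  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `extract_from_config`: a minimal, made-up `config.json`-style dict is written to a temporary file. The keys are real Hugging Face config fields; the values are illustrative only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sanity check: exercise extract_from_config on a minimal,\n",
    "# made-up config dict (values are illustrative, not from a real model).\n",
    "import tempfile\n",
    "\n",
    "sample_config = {\n",
    "    \"max_position_embeddings\": 8192,\n",
    "    \"rope_theta\": 10000.0,\n",
    "    \"torch_dtype\": \"bfloat16\",\n",
    "}\n",
    "with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:\n",
    "    json.dump(sample_config, tmp)\n",
    "    tmp_path = tmp.name\n",
    "print(extract_from_config(tmp_path))\n",
    "os.remove(tmp_path)"
   ]
  },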
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "unquantized = defaultdict(lambda: \"fp16\")\n",
    "unquantized[\"float32\"] = \"fp32\"\n",
    "unquantized[\"float16\"]   = \"fp16\"\n",
    "unquantized[\"bfloat16\"]   = \"bf16\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_from_generation_config(generation_config_file):\n",
    "    \"\"\"Extract generation-specific parameters relevant to llama-perplexity if available.\"\"\"\n",
    "    with open(generation_config_file, 'r') as file:\n",
    "        generation_data = json.load(file)\n",
    "    \n",
    "    # Extract and map only parameters useful for llama-perplexity\n",
    "    params = {}\n",
    "    params['top_p'] = generation_data.get(\"top_p\")                        # Top-p sampling\n",
    "    params['temp'] = generation_data.get(\"temperature\")                   # Sampling temperature\n",
    "    params['repeat_penalty'] = generation_data.get(\"repetition_penalty\")  # Repetition penalty\n",
    "    params['repeat_last_n'] = generation_data.get(\"repeat_last_n\")        # Last N tokens for repetition penalty\n",
    "    params['top_k'] = generation_data.get(\"top_k\")                        # Top-k sampling (if present)\n",
    "    params['presence_penalty'] = generation_data.get(\"presence_penalty\")  # Presence penalty\n",
    "    params['frequency_penalty'] = generation_data.get(\"frequency_penalty\")# Frequency penalty\n",
    "\n",
    "    # Remove None values to avoid overwriting defaults\n",
    "    params = {key: value for key, value in params.items() if value is not None}\n",
    "\n",
    "    return params\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parameters(use_temp=False):\n",
    "    \"\"\"Retrieve parameters from the configuration files or use defaults, preferring generation_config if available.\"\"\"\n",
    "    # Initialize default parameters\n",
    "    config_params = dict()\n",
    "\n",
    "    # Extract parameters from config.json, if available\n",
    "    try:\n",
    "        config_params.update(extract_from_config('config.json'))\n",
    "    except FileNotFoundError:\n",
    "        print(\"config.json not found. Using default values.\")\n",
    "\n",
    "    # Extract parameters from generation_config.json, if available and prefer these values\n",
    "    try:\n",
    "        gen_params = extract_from_generation_config('generation_config.json')\n",
    "        # Update config_params with values from gen_params, if they are not None\n",
    "        for key, value in gen_params.items():\n",
    "            if value is not None:\n",
    "                config_params[key] = value\n",
    "    except FileNotFoundError:\n",
    "        print(\"generation_config.json not found. Using default generation values.\")\n",
    "\n",
    "    # Ensure that temperature ('temp') is never used\n",
    "    if 'temp' in config_params and use_temp is False:\n",
    "        config_params['temp'] = 0  # Set temperature to 0\n",
    "\n",
    "    return config_params\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ctx_size': 8192, 'rope_freq_base': 10000.0, 'rope_scaling': None, 'rope_scaling_type': None, 'torch_dtype': 'bfloat16', 'top_p': 0.95, 'temp': 0, 'repeat_penalty': 1.2, 'repeat_last_n': None, 'min_p': None, 'top_k': None, 'presence_penalty': None, 'frequency_penalty': None, 'mirostat': None, 'mirostat_lr': None, 'mirostat_ent': None, 'tfs': None, 'typical': None}\n"
     ]
    }
   ],
   "source": [
    "# Extract configuration parameters\n",
    "config_params = get_parameters()\n",
    "print(config_params)\n",
    "\n",
    "base_precision = unquantized[config_params[\"torch_dtype\"]]\n",
    "\n",
    "base_model = f'{base_model_name}_{base_precision}.gguf'\n",
    "base_perplexity_file = f\"perplexity_{base_precision}.txt\"\n",
    "\n",
    "threads = max(multiprocessing.cpu_count() - 1, 1)\n",
    "batch_size = 512\n",
    "ubatch_size = 128\n",
    "dataset_file = \"imatrix/oscar/imatrix-dataset.txt\"  \n",
    "ppl_file = \"ppl_test_data.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quantization types:  ['IQ2_XS', 'IQ3_M', 'IQ3_S', 'IQ3_XS', 'IQ3_XXS', 'IQ4_NL', 'IQ4_XS', 'Q3_K_L', 'Q3_K_M', 'Q3_K_S', 'Q4_K_M', 'Q4_K_S', 'Q5_K_M', 'Q5_K_S', 'Q6_K', 'Q8_0', 'TQ1_0', 'TQ2_0']\n"
     ]
    }
   ],
   "source": [
    "# Load YAML file and extract quantization types\n",
    "yaml_file = 'quantizations.yaml'\n",
    "with open(yaml_file, 'r') as file:\n",
    "    data = yaml.safe_load(file)\n",
    "\n",
    "# Extract the list of quantization types\n",
    "quantization_types = data['quantizations']\n",
    "print(\"Quantization types: \", quantization_types)"
   ]
  },
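  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If `quantizations.yaml` is missing, the following sketch writes a minimal one (it would have to run before the loading cell above). The top-level `quantizations` list is the layout that cell expects; this particular set of types is only an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: create quantizations.yaml when it does not exist.\n",
    "# The layout (a top-level 'quantizations' list) is what the loading\n",
    "# cell expects; this set of types is just an example.\n",
    "if not os.path.exists(yaml_file):\n",
    "    example = {'quantizations': ['Q4_K_M', 'Q5_K_M', 'Q6_K', 'Q8_0']}\n",
    "    with open(yaml_file, 'w') as file:\n",
    "        yaml.safe_dump(example, file)\n",
    "    print(f\"Wrote example {yaml_file}\")"
   ]
  },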
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quantization parameters\n",
    "use_leave_output_tensor = True  # Set to False if you don't want to use --leave-output-tensor\n",
    "\n",
    "# Optional importance matrix path (set to None if you don't want to include --imatrix)\n",
    "imatrix_path = \"imatrix/oscar/imatrix.dat\"  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def quantize_model(\n",
    "    quantization_type, \n",
    "    base_model, \n",
    "    base_model_name, \n",
    "    path_to_llamacpp=\"\",\n",
    "    imatrix_path=None, \n",
    "    use_leave_output_tensor=True,\n",
    "    output_dir=\".\"\n",
    "):\n",
    "    \"\"\"\n",
    "    Quantize the base model into the specified quantization type.\n",
    "\n",
    "    Parameters:\n",
    "    - quantization_type (str): The type of quantization (e.g., \"Q4_0\", \"Q5_K_M\").\n",
    "    - base_model (str): Path to the base model file (e.g., \"salamandra-2b_bf16.gguf\").\n",
    "    - base_model_name (str): The base name of the model (e.g., \"salamandra-2b\").\n",
    "    - path_to_llamacpp (str): Path to the llama-quantize binary.\n",
    "    - imatrix_path (str, optional): Path to the importance matrix file. Default is None.\n",
    "    - use_leave_output_tensor (bool): Whether to include the --leave-output-tensor flag. Default is True.\n",
    "    - output_dir (str): Directory where the quantized models and logs will be saved. Default is current directory.\n",
    "\n",
    "    Returns:\n",
    "    - None\n",
    "    \"\"\"\n",
    "    # Construct the output model path\n",
    "    output_model = os.path.join(output_dir, f\"{base_model_name}_{quantization_type}.gguf\")\n",
    "\n",
    "    # Check if the quantized model already exists\n",
    "    if os.path.exists(output_model):\n",
    "        print(f\"Quantized model {output_model} already exists. Skipping quantization.\")\n",
    "        return\n",
    "\n",
    "    # Build the llama-quantize command\n",
    "    command_parts = [\n",
    "        os.path.join(path_to_llamacpp, \"llama-quantize\")\n",
    "    ]\n",
    "\n",
    "    # Conditionally add the --imatrix argument if the path is provided\n",
    "    if imatrix_path:\n",
    "        command_parts.append(f\"--imatrix {imatrix_path}\")\n",
    "\n",
    "    # Conditionally add the --leave-output-tensor argument based on the external boolean\n",
    "    if use_leave_output_tensor:\n",
    "        command_parts.append(\"--leave-output-tensor\")\n",
    "\n",
    "    # Add base model, output model, and quantization type\n",
    "    command_parts.extend([\n",
    "        f\"{base_model}\",\n",
    "        f\"\\\"{output_model}\\\"\",\n",
    "        f\"{quantization_type}\"\n",
    "    ])\n",
    "\n",
    "    # Redirect output to a log file for each quantization type\n",
    "    log_file = os.path.join(output_dir, f\"{quantization_type}_log.txt\")\n",
    "    command_parts.append(f\"> \\\"{log_file}\\\" 2>&1\")\n",
    "\n",
    "    # Join the command parts into a single command string\n",
    "    quantize_command = \" \".join(command_parts)\n",
    "\n",
    "    # Run the quantization command\n",
    "    print(f\"Quantizing model to {quantization_type} format with command: {quantize_command}\")\n",
    "    result = subprocess.run(quantize_command, shell=True, text=True)\n",
    "    if result.returncode != 0:\n",
    "        print(f\"Error during quantization to {quantization_type}. Check {log_file} for details.\")\n",
    "    else:\n",
    "        print(f\"Successfully quantized model to {quantization_type} and saved as {output_model}.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_command(command):\n",
    "    \"\"\"Function to run a command and capture output\"\"\"\n",
    "    print(f\"Running command: {command}\")\n",
    "    result = subprocess.run(command, shell=True, capture_output=True, text=True)\n",
    "    if result.returncode != 0:\n",
    "        print(f\"Error executing command: {result.stderr}\")\n",
    "    return result.stdout\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_perplexity(output):\n",
    "    \"\"\"extract perplexity from the output\"\"\"\n",
    "    match = re.search(r\"Final estimate: PPL = ([\\d.]+)\", output)\n",
    "    if match:\n",
    "        return float(match.group(1))\n",
    "    return None\n"
   ]
  },
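  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The regex above targets the `Final estimate: PPL = ...` summary line that `llama-perplexity` prints at the end of a run. A quick check against a sample line (the numbers are made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check of extract_perplexity against a sample summary line;\n",
    "# the numbers are made up, only the line format matters here.\n",
    "sample_output = \"...\\nFinal estimate: PPL = 14.0431 +/- 0.1024\\n\"\n",
    "assert extract_perplexity(sample_output) == 14.0431\n",
    "print(extract_perplexity(sample_output))"
   ]
  },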
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_command(model, output_file, ppl_file, config_params, threads=8, batch_size=512, ubatch_size=128):\n",
    "    \"\"\"Build the perplexity command based on the provided parameters.\"\"\"\n",
    "    command_parts = [\n",
    "        \"/Users/macdev/Downloads/build/bin/llama-perplexity\",\n",
    "        f\"-m {model}\",\n",
    "        f\"-f {ppl_file}\",\n",
    "        \"--perplexity\",\n",
    "    ]\n",
    "\n",
    "    # Add parameters only if they are set in config_params\n",
    "    if config_params.get('ctx_size') is not None:\n",
    "        command_parts.append(f\"--ctx-size {config_params['ctx_size']}\")\n",
    "    if config_params.get('rope_freq_base') is not None:\n",
    "        command_parts.append(f\"--rope-freq-base {config_params['rope_freq_base']}\")\n",
    "    if config_params.get('rope_freq_scale') is not None:\n",
    "        command_parts.append(f\"--rope-freq-scale {config_params['rope_freq_scale']}\")\n",
    "    if config_params.get('rope_scaling_type') is not None:\n",
    "        command_parts.append(f\"--rope-scaling {config_params['rope_scaling_type']}\")\n",
    "\n",
    "    # Add sampling-related parameters if they are set\n",
    "    if config_params.get('top_p') is not None:\n",
    "        command_parts.append(f\"--top-p {config_params['top_p']}\")\n",
    "    if config_params.get('repeat_penalty') is not None:\n",
    "        command_parts.append(f\"--repeat-penalty {config_params['repeat_penalty']}\")\n",
    "    if config_params.get('repeat_last_n') is not None:\n",
    "        command_parts.append(f\"--repeat-last-n {config_params['repeat_last_n']}\")\n",
    "\n",
    "    # Do not include `temp` as it's set to 0 in `get_parameters` if `use_temp` is False\n",
    "    # Only add if temp is non-zero (if `use_temp` is True in get_parameters)\n",
    "    if config_params.get('temp') is not None and config_params['temp'] != 0:\n",
    "        command_parts.append(f\"--temp {config_params['temp']}\")\n",
    "\n",
    "    # Add fixed parameters for threads and batch sizes\n",
    "    command_parts.extend([\n",
    "        f\"--threads {threads}\",\n",
    "        f\"--batch-size {batch_size}\",\n",
    "        f\"--ubatch-size {ubatch_size}\",\n",
    "    ])\n",
    "\n",
    "    # Redirect output to file\n",
    "    command = \" \".join(command_parts) + f\" > {output_file} 2>&1\"\n",
    "    return command\n"
   ]
  },
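  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a dry run, the assembled command can be inspected without executing it; `perplexity_preview.txt` below is just a placeholder output file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dry run: print the command that would be executed for the base model.\n",
    "# 'perplexity_preview.txt' is a placeholder output file name.\n",
    "preview = build_command(base_model, 'perplexity_preview.txt', ppl_file,\n",
    "                        config_params=config_params, threads=threads,\n",
    "                        batch_size=batch_size, ubatch_size=ubatch_size)\n",
    "print(preview)"
   ]
  },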
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Measure perplexity for the base model\n",
    "if os.path.exists(f'perplexity_{base_precision}.txt'):\n",
    "        with open(base_perplexity_file, 'r') as file:\n",
    "                base_output = file.read()\n",
    "else:\n",
    "        base_command = build_command(base_model, base_perplexity_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=        ubatch_size)\n",
    "        base_output = run_command(base_command)\n",
    "base_perplexity = extract_perplexity(base_output)\n",
    "calculated_perplexity_recently = False # This will be set to True later"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quantizing model to IQ2_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ2_XS.gguf\" IQ2_XS > \"./IQ2_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ2_XS and saved as ./salamandra-2b_IQ2_XS.gguf.\n",
      "Quantizing model to IQ3_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_M.gguf\" IQ3_M > \"./IQ3_M_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_M and saved as ./salamandra-2b_IQ3_M.gguf.\n",
      "Quantizing model to IQ3_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_S.gguf\" IQ3_S > \"./IQ3_S_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_S and saved as ./salamandra-2b_IQ3_S.gguf.\n",
      "Quantizing model to IQ3_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_XS.gguf\" IQ3_XS > \"./IQ3_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_XS and saved as ./salamandra-2b_IQ3_XS.gguf.\n",
      "Quantizing model to IQ3_XXS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_XXS.gguf\" IQ3_XXS > \"./IQ3_XXS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_XXS and saved as ./salamandra-2b_IQ3_XXS.gguf.\n",
      "Quantizing model to IQ4_NL format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ4_NL.gguf\" IQ4_NL > \"./IQ4_NL_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ4_NL and saved as ./salamandra-2b_IQ4_NL.gguf.\n",
      "Quantizing model to IQ4_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ4_XS.gguf\" IQ4_XS > \"./IQ4_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ4_XS and saved as ./salamandra-2b_IQ4_XS.gguf.\n",
      "Quantizing model to Q3_K_L format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_L.gguf\" Q3_K_L > \"./Q3_K_L_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_L and saved as ./salamandra-2b_Q3_K_L.gguf.\n",
      "Quantizing model to Q3_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_M.gguf\" Q3_K_M > \"./Q3_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_M and saved as ./salamandra-2b_Q3_K_M.gguf.\n",
      "Quantizing model to Q3_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_S.gguf\" Q3_K_S > \"./Q3_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_S and saved as ./salamandra-2b_Q3_K_S.gguf.\n",
      "Quantizing model to Q4_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q4_K_M.gguf\" Q4_K_M > \"./Q4_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q4_K_M and saved as ./salamandra-2b_Q4_K_M.gguf.\n",
      "Quantizing model to Q4_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q4_K_S.gguf\" Q4_K_S > \"./Q4_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q4_K_S and saved as ./salamandra-2b_Q4_K_S.gguf.\n",
      "Quantizing model to Q5_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q5_K_M.gguf\" Q5_K_M > \"./Q5_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q5_K_M and saved as ./salamandra-2b_Q5_K_M.gguf.\n",
      "Quantizing model to Q5_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q5_K_S.gguf\" Q5_K_S > \"./Q5_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q5_K_S and saved as ./salamandra-2b_Q5_K_S.gguf.\n",
      "Quantizing model to Q6_K format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q6_K.gguf\" Q6_K > \"./Q6_K_log.txt\" 2>&1\n",
      "Successfully quantized model to Q6_K and saved as ./salamandra-2b_Q6_K.gguf.\n",
      "Quantizing model to Q8_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q8_0.gguf\" Q8_0 > \"./Q8_0_log.txt\" 2>&1\n",
      "Successfully quantized model to Q8_0 and saved as ./salamandra-2b_Q8_0.gguf.\n",
      "Quantizing model to TQ1_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_TQ1_0.gguf\" TQ1_0 > \"./TQ1_0_log.txt\" 2>&1\n",
      "Error during quantization to TQ1_0. Check ./TQ1_0_log.txt for details.\n",
      "Quantizing model to TQ2_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_TQ2_0.gguf\" TQ2_0 > \"./TQ2_0_log.txt\" 2>&1\n",
      "Error during quantization to TQ2_0. Check ./TQ2_0_log.txt for details.\n"
     ]
    }
   ],
   "source": [
    "# Quantize the models\n",
    "for quant in quantization_types:\n",
    "    quantize_model(\n",
    "        quantization_type=quant,\n",
    "        base_model=base_model,\n",
    "        base_model_name=base_model_name,\n",
    "        path_to_llamacpp=path_to_llamacpp,\n",
    "        imatrix_path=imatrix_path,\n",
    "        use_leave_output_tensor=use_leave_output_tensor,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ2_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ2_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_XXS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_XXS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ4_NL.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_NL.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ4_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_L.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_L.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q4_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q4_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q5_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q5_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q6_K.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q6_K.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q8_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q8_0.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_TQ1_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_TQ1_0.txt 2>&1\n",
      "Error executing command: \n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_TQ2_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_TQ2_0.txt 2>&1\n",
      "Error executing command: \n"
     ]
    }
   ],
   "source": [
    "# Measure perplexity for each quantized model\n",
    "perplexity_results = dict()\n",
    "perplexity_results[base_precision] = base_perplexity\n",
    "for quant in quantization_types:\n",
    "    calculated_perplexity_recently = True\n",
    "    \n",
    "    model = f\"{base_model_name}_{quant}.gguf\"\n",
    "    output_file = f\"perplexity_{quant}.txt\"\n",
    "\n",
    "    command = build_command(model, output_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=        ubatch_size)\n",
    "    output = run_command(command)\n",
    "\n",
    "    perplexity = extract_perplexity(output)\n",
    "    perplexity_results[quant] = perplexity"
   ]
  },
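  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tables below report $\\ln(\\mathrm{PPL}(Q)/\\mathrm{PPL}(\\mathrm{fp16}))$. Since $\\ln(1+x) \\approx x$ for small $x$, this is approximately the relative perplexity increase: e.g. $\\ln(14.399/14.0431) \\approx 0.0250$, i.e. roughly a 2.5% increase for Q4_K_M."
   ]
  },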
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load previous measurements if we didnt just measure perplexity for each quantized model\n",
    "if not calculated_perplexity_recently:\n",
    "    perplexity_results = dict()\n",
    "    perplexity_results[base_precision] = base_perplexity\n",
    "\n",
    "    for quant in quantization_types:\n",
    "        output_file = f\"perplexity_{quant}.txt\"\n",
    "        try:\n",
    "            with open(output_file, 'r') as file:\n",
    "                output = file.read()\n",
    "            perplexity = extract_perplexity(output)\n",
    "        except FileNotFoundError:\n",
    "            print(f\"Output file {output_file} not found.\")\n",
    "            perplexity = None\n",
    "\n",
    "        perplexity_results[quant] = perplexity\n",
    "\n",
    "    # Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "    print(\"\\nPerplexity Comparison Table:\")\n",
    "    print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "    print(\"=\" * 55)\n",
    "    for quant, ppl in perplexity_results.items():\n",
    "        if ppl and base_perplexity:\n",
    "            ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "            print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")\n",
    "\n",
    "    print(perplexity_results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "print(\"\\nPerplexity Comparison Table:\")\n",
    "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "print(\"=\" * 55)\n",
    "for quant, ppl in perplexity_results.items():\n",
    "    if ppl and base_perplexity:\n",
    "        ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "        print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Output file perplexity_TQ1_0.txt not found.\n",
      "Output file perplexity_TQ2_0.txt not found.\n",
      "\n",
      "Perplexity Comparison Table:\n",
      "Quantization Type    PPL(Q)     ln(PPL(Q)/PPL(fp16))     \n",
      "=======================================================\n",
      "bf16                 14.0431    0.0                      \n",
      "IQ2_XS               28.9052    0.72189                  \n",
      "IQ3_M                15.1995    0.079131                 \n",
      "IQ3_S                15.8627    0.121839                 \n",
      "IQ3_XS               16.7197    0.174456                 \n",
      "IQ3_XXS              17.6216    0.226994                 \n",
      "IQ4_NL               14.5534    0.035693                 \n",
      "IQ4_XS               14.5638    0.036408                 \n",
      "Q3_K_L               15.0444    0.068875                 \n",
      "Q3_K_M               15.2582    0.082986                 \n",
      "Q3_K_S               15.839     0.120344                 \n",
      "Q4_K_M               14.399     0.025028                 \n",
      "Q4_K_S               14.4338    0.027442                 \n",
      "Q5_K_M               14.1299    0.006162                 \n",
      "Q5_K_S               14.1497    0.007562                 \n",
      "Q6_K                 14.0675    0.001736                 \n",
      "Q8_0                 14.0495    0.000456                 \n",
      "{'bf16': 14.0431, 'IQ2_XS': 28.9052, 'IQ3_M': 15.1995, 'IQ3_S': 15.8627, 'IQ3_XS': 16.7197, 'IQ3_XXS': 17.6216, 'IQ4_NL': 14.5534, 'IQ4_XS': 14.5638, 'Q3_K_L': 15.0444, 'Q3_K_M': 15.2582, 'Q3_K_S': 15.839, 'Q4_K_M': 14.399, 'Q4_K_S': 14.4338, 'Q5_K_M': 14.1299, 'Q5_K_S': 14.1497, 'Q6_K': 14.0675, 'Q8_0': 14.0495, 'TQ1_0': None, 'TQ2_0': None}\n"
     ]
    }
   ],
   "source": [
    "perplexity_results = dict()\n",
    "perplexity_results[base_precision] = base_perplexity\n",
    "\n",
    "for quant in quantization_types:\n",
    "    output_file = f\"perplexity_{quant}.txt\"\n",
    "    try:\n",
    "        with open(output_file, 'r') as file:\n",
    "            output = file.read()\n",
    "        perplexity = extract_perplexity(output)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Output file {output_file} not found.\")\n",
    "        perplexity = None\n",
    "\n",
    "    perplexity_results[quant] = perplexity\n",
    "\n",
    "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "print(\"\\nPerplexity Comparison Table:\")\n",
    "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "print(\"=\" * 55)\n",
    "for quant, ppl in perplexity_results.items():\n",
    "    if ppl and base_perplexity:\n",
    "        ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "        print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")\n",
    "\n",
    "print(perplexity_results)\n"
   ]
  }
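  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, the collected results can be persisted so the comparison table can be rebuilt later without re-running `llama-perplexity`; the file name below is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: persist the collected results. 'perplexity_results.json'\n",
    "# is an arbitrary file name, not something llama.cpp expects.\n",
    "with open('perplexity_results.json', 'w') as file:\n",
    "    json.dump(perplexity_results, file, indent=2)\n",
    "print('Saved perplexity_results.json')"
   ]
  }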
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}