{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import math\n",
    "import multiprocessing\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "import subprocess\n",
    "import yaml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define base model name and default values for parameters\n",
    "path_to_llamacpp = '/Users/macdev/Downloads/build/bin'\n",
    "base_model_name = 'salamandra-2b'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_from_config(config_file):\n",
    "    \"\"\"Extract parameters like context size, rope frequency base, and other sampling settings from a config JSON file.\"\"\"\n",
    "    with open(config_file, 'r') as file:\n",
    "        config_data = json.load(file)\n",
    "\n",
    "    # Extract parameters if present\n",
    "    params = {}\n",
    "    params['ctx_size'] = config_data.get(\"max_position_embeddings\")  # Context size\n",
    "    params['rope_freq_base'] = config_data.get(\"rope_theta\")         # RoPE frequency base\n",
    "    params['rope_scaling'] = config_data.get(\"rope_scaling\")         # RoPE scaling factor\n",
    "    params['rope_scaling_type'] = config_data.get(\"rope_scaling_type\") # RoPE scaling type\n",
    "    params['torch_dtype'] = config_data.get(\"torch_dtype\")           # Torch data type\n",
    "    params['top_p'] = config_data.get(\"sampling.top_p\")              # Top-p sampling\n",
    "    params['temp'] = config_data.get(\"sampling.temperature\")         # Sampling temperature\n",
    "    params['repeat_penalty'] = config_data.get(\"sampling.repeat_penalty\") # Repetition penalty\n",
    "    params['repeat_last_n'] = config_data.get(\"sampling.repeat_last_n\")   # Last N tokens for repetition penalty\n",
    "    params['min_p'] = config_data.get(\"sampling.min_p\")              # Minimum probability sampling\n",
    "    params['top_k'] = config_data.get(\"sampling.top_k\")              # Top-k sampling\n",
    "    params['presence_penalty'] = config_data.get(\"sampling.presence_penalty\") # Presence penalty for repeat tokens\n",
    "    params['frequency_penalty'] = config_data.get(\"sampling.frequency_penalty\") # Frequency penalty for repeat tokens\n",
    "    params['mirostat'] = config_data.get(\"sampling.mirostat\")        # Mirostat sampling\n",
    "    params['mirostat_lr'] = config_data.get(\"sampling.mirostat_lr\")  # Mirostat learning rate\n",
    "    params['mirostat_ent'] = config_data.get(\"sampling.mirostat_ent\") # Mirostat entropy target\n",
    "    params['tfs'] = config_data.get(\"sampling.tfs\")                  # Tail free sampling\n",
    "    params['typical'] = config_data.get(\"sampling.typical\")          # Locally typical sampling\n",
    "\n",
    "    return params\n"
   ]
  },
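  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of `extract_from_config`: a minimal, made-up `config.json`-style dict is written to a temporary file. The keys are real Hugging Face config fields; the values are illustrative only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sanity check: exercise extract_from_config on a minimal,\n",
    "# made-up config dict (values are illustrative, not from a real model).\n",
    "import tempfile\n",
    "\n",
    "sample_config = {\n",
    "    \"max_position_embeddings\": 8192,\n",
    "    \"rope_theta\": 10000.0,\n",
    "    \"torch_dtype\": \"bfloat16\",\n",
    "}\n",
    "with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:\n",
    "    json.dump(sample_config, tmp)\n",
    "    tmp_path = tmp.name\n",
    "print(extract_from_config(tmp_path))\n",
    "os.remove(tmp_path)"
   ]
  },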
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "unquantized = defaultdict(lambda: \"fp16\")\n",
    "unquantized[\"float32\"] = \"fp32\"\n",
    "unquantized[\"float16\"]   = \"fp16\"\n",
    "unquantized[\"bfloat16\"]   = \"bf16\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_from_generation_config(generation_config_file):\n",
    "    \"\"\"Extract generation-specific parameters relevant to llama-perplexity if available.\"\"\"\n",
    "    with open(generation_config_file, 'r') as file:\n",
    "        generation_data = json.load(file)\n",
    "    \n",
    "    # Extract and map only parameters useful for llama-perplexity\n",
    "    params = {}\n",
    "    params['top_p'] = generation_data.get(\"top_p\")                        # Top-p sampling\n",
    "    params['temp'] = generation_data.get(\"temperature\")                   # Sampling temperature\n",
    "    params['repeat_penalty'] = generation_data.get(\"repetition_penalty\")  # Repetition penalty\n",
    "    params['repeat_last_n'] = generation_data.get(\"repeat_last_n\")        # Last N tokens for repetition penalty\n",
    "    params['top_k'] = generation_data.get(\"top_k\")                        # Top-k sampling (if present)\n",
    "    params['presence_penalty'] = generation_data.get(\"presence_penalty\")  # Presence penalty\n",
    "    params['frequency_penalty'] = generation_data.get(\"frequency_penalty\")# Frequency penalty\n",
    "\n",
    "    # Remove None values to avoid overwriting defaults\n",
    "    params = {key: value for key, value in params.items() if value is not None}\n",
    "\n",
    "    return params\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parameters(use_temp=False):\n",
    "    \"\"\"Retrieve parameters from the configuration files or use defaults, preferring generation_config if available.\"\"\"\n",
    "    # Initialize default parameters\n",
    "    config_params = dict()\n",
    "\n",
    "    # Extract parameters from config.json, if available\n",
    "    try:\n",
    "        config_params.update(extract_from_config('config.json'))\n",
    "    except FileNotFoundError:\n",
    "        print(\"config.json not found. Using default values.\")\n",
    "\n",
    "    # Extract parameters from generation_config.json, if available and prefer these values\n",
    "    try:\n",
    "        gen_params = extract_from_generation_config('generation_config.json')\n",
    "        # Update config_params with values from gen_params, if they are not None\n",
    "        for key, value in gen_params.items():\n",
    "            if value is not None:\n",
    "                config_params[key] = value\n",
    "    except FileNotFoundError:\n",
    "        print(\"generation_config.json not found. Using default generation values.\")\n",
    "\n",
    "    # Ensure that temperature ('temp') is never used\n",
    "    if 'temp' in config_params and use_temp is False:\n",
    "        config_params['temp'] = 0  # Set temperature to 0\n",
    "\n",
    "    return config_params\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'ctx_size': 8192, 'rope_freq_base': 10000.0, 'rope_scaling': None, 'rope_scaling_type': None, 'torch_dtype': 'bfloat16', 'top_p': 0.95, 'temp': 0, 'repeat_penalty': 1.2, 'repeat_last_n': None, 'min_p': None, 'top_k': None, 'presence_penalty': None, 'frequency_penalty': None, 'mirostat': None, 'mirostat_lr': None, 'mirostat_ent': None, 'tfs': None, 'typical': None}\n"
     ]
    }
   ],
   "source": [
    "# Extract configuration parameters\n",
    "config_params = get_parameters()\n",
    "print(config_params)\n",
    "\n",
    "base_precision = unquantized[config_params[\"torch_dtype\"]]\n",
    "\n",
    "base_model = f'{base_model_name}_{base_precision}.gguf'\n",
    "base_perplexity_file = f\"perplexity_{base_precision}.txt\"\n",
    "\n",
    "threads = max(multiprocessing.cpu_count() - 1, 1)\n",
    "batch_size = 512\n",
    "ubatch_size = 128\n",
    "dataset_file = \"imatrix/oscar/imatrix-dataset.txt\"  \n",
    "ppl_file = \"ppl_test_data.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quantization types:  ['IQ2_XS', 'IQ3_M', 'IQ3_S', 'IQ3_XS', 'IQ3_XXS', 'IQ4_NL', 'IQ4_XS', 'Q3_K_L', 'Q3_K_M', 'Q3_K_S', 'Q4_K_M', 'Q4_K_S', 'Q5_K_M', 'Q5_K_S', 'Q6_K', 'Q8_0', 'TQ1_0', 'TQ2_0']\n"
     ]
    }
   ],
   "source": [
    "# Load YAML file and extract quantization types\n",
    "yaml_file = 'quantizations.yaml'\n",
    "with open(yaml_file, 'r') as file:\n",
    "    data = yaml.safe_load(file)\n",
    "\n",
    "# Extract the list of quantization types\n",
    "quantization_types = data['quantizations']\n",
    "print(\"Quantization types: \", quantization_types)"
   ]
  },
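  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If `quantizations.yaml` is missing, the following sketch writes a minimal one (it would have to run before the loading cell above). The top-level `quantizations` list is the layout that cell expects; this particular set of types is only an example."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch: create quantizations.yaml when it does not exist.\n",
    "# The layout (a top-level 'quantizations' list) is what the loading\n",
    "# cell expects; this set of types is just an example.\n",
    "if not os.path.exists(yaml_file):\n",
    "    example = {'quantizations': ['Q4_K_M', 'Q5_K_M', 'Q6_K', 'Q8_0']}\n",
    "    with open(yaml_file, 'w') as file:\n",
    "        yaml.safe_dump(example, file)\n",
    "    print(f\"Wrote example {yaml_file}\")"
   ]
  },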
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quantization parameters\n",
    "use_leave_output_tensor = True  # Set to False if you don't want to use --leave-output-tensor\n",
    "\n",
    "# Optional importance matrix path (set to None if you don't want to include --imatrix)\n",
    "imatrix_path = \"imatrix/oscar/imatrix.dat\"  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def quantize_model(\n",
    "    quantization_type, \n",
    "    base_model, \n",
    "    base_model_name, \n",
    "    path_to_llamacpp=\"\",\n",
    "    imatrix_path=None, \n",
    "    use_leave_output_tensor=True,\n",
    "    output_dir=\".\"\n",
    "):\n",
    "    \"\"\"\n",
    "    Quantize the base model into the specified quantization type.\n",
    "\n",
    "    Parameters:\n",
    "    - quantization_type (str): The type of quantization (e.g., \"Q4_0\", \"Q5_K_M\").\n",
    "    - base_model (str): Path to the base model file (e.g., \"salamandra-2b_bf16.gguf\").\n",
    "    - base_model_name (str): The base name of the model (e.g., \"salamandra-2b\").\n",
    "    - path_to_llamacpp (str): Path to the llama-quantize binary.\n",
    "    - imatrix_path (str, optional): Path to the importance matrix file. Default is None.\n",
    "    - use_leave_output_tensor (bool): Whether to include the --leave-output-tensor flag. Default is True.\n",
    "    - output_dir (str): Directory where the quantized models and logs will be saved. Default is current directory.\n",
    "\n",
    "    Returns:\n",
    "    - None\n",
    "    \"\"\"\n",
    "    # Construct the output model path\n",
    "    output_model = os.path.join(output_dir, f\"{base_model_name}_{quantization_type}.gguf\")\n",
    "\n",
    "    # Check if the quantized model already exists\n",
    "    if os.path.exists(output_model):\n",
    "        print(f\"Quantized model {output_model} already exists. Skipping quantization.\")\n",
    "        return\n",
    "\n",
    "    # Build the llama-quantize command\n",
    "    command_parts = [\n",
    "        os.path.join(path_to_llamacpp, \"llama-quantize\")\n",
    "    ]\n",
    "\n",
    "    # Conditionally add the --imatrix argument if the path is provided\n",
    "    if imatrix_path:\n",
    "        command_parts.append(f\"--imatrix {imatrix_path}\")\n",
    "\n",
    "    # Conditionally add the --leave-output-tensor argument based on the external boolean\n",
    "    if use_leave_output_tensor:\n",
    "        command_parts.append(\"--leave-output-tensor\")\n",
    "\n",
    "    # Add base model, output model, and quantization type\n",
    "    command_parts.extend([\n",
    "        f\"{base_model}\",\n",
    "        f\"\\\"{output_model}\\\"\",\n",
    "        f\"{quantization_type}\"\n",
    "    ])\n",
    "\n",
    "    # Redirect output to a log file for each quantization type\n",
    "    log_file = os.path.join(output_dir, f\"{quantization_type}_log.txt\")\n",
    "    command_parts.append(f\"> \\\"{log_file}\\\" 2>&1\")\n",
    "\n",
    "    # Join the command parts into a single command string\n",
    "    quantize_command = \" \".join(command_parts)\n",
    "\n",
    "    # Run the quantization command\n",
    "    print(f\"Quantizing model to {quantization_type} format with command: {quantize_command}\")\n",
    "    result = subprocess.run(quantize_command, shell=True, text=True)\n",
    "    if result.returncode != 0:\n",
    "        print(f\"Error during quantization to {quantization_type}. Check {log_file} for details.\")\n",
    "    else:\n",
    "        print(f\"Successfully quantized model to {quantization_type} and saved as {output_model}.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_command(command):\n",
    "    \"\"\"Function to run a command and capture output\"\"\"\n",
    "    print(f\"Running command: {command}\")\n",
    "    result = subprocess.run(command, shell=True, capture_output=True, text=True)\n",
    "    if result.returncode != 0:\n",
    "        print(f\"Error executing command: {result.stderr}\")\n",
    "    return result.stdout\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_perplexity(output):\n",
    "    \"\"\"extract perplexity from the output\"\"\"\n",
    "    match = re.search(r\"Final estimate: PPL = ([\\d.]+)\", output)\n",
    "    if match:\n",
    "        return float(match.group(1))\n",
    "    return None\n"
   ]
  },
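  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The regex above targets the `Final estimate: PPL = ...` summary line that `llama-perplexity` prints at the end of a run. A quick check against a sample line (the numbers are made up):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick check of extract_perplexity against a sample summary line;\n",
    "# the numbers are made up, only the line format matters here.\n",
    "sample_output = \"...\\nFinal estimate: PPL = 14.0431 +/- 0.1024\\n\"\n",
    "assert extract_perplexity(sample_output) == 14.0431\n",
    "print(extract_perplexity(sample_output))"
   ]
  },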
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_command(model, output_file, ppl_file, config_params, threads=8, batch_size=512, ubatch_size=128):\n",
    "    \"\"\"Build the perplexity command based on the provided parameters.\"\"\"\n",
    "    command_parts = [\n",
    "        \"/Users/macdev/Downloads/build/bin/llama-perplexity\",\n",
    "        f\"-m {model}\",\n",
    "        f\"-f {ppl_file}\",\n",
    "        \"--perplexity\",\n",
    "    ]\n",
    "\n",
    "    # Add parameters only if they are set in config_params\n",
    "    if config_params.get('ctx_size') is not None:\n",
    "        command_parts.append(f\"--ctx-size {config_params['ctx_size']}\")\n",
    "    if config_params.get('rope_freq_base') is not None:\n",
    "        command_parts.append(f\"--rope-freq-base {config_params['rope_freq_base']}\")\n",
    "    if config_params.get('rope_freq_scale') is not None:\n",
    "        command_parts.append(f\"--rope-freq-scale {config_params['rope_freq_scale']}\")\n",
    "    if config_params.get('rope_scaling_type') is not None:\n",
    "        command_parts.append(f\"--rope-scaling {config_params['rope_scaling_type']}\")\n",
    "\n",
    "    # Add sampling-related parameters if they are set\n",
    "    if config_params.get('top_p') is not None:\n",
    "        command_parts.append(f\"--top-p {config_params['top_p']}\")\n",
    "    if config_params.get('repeat_penalty') is not None:\n",
    "        command_parts.append(f\"--repeat-penalty {config_params['repeat_penalty']}\")\n",
    "    if config_params.get('repeat_last_n') is not None:\n",
    "        command_parts.append(f\"--repeat-last-n {config_params['repeat_last_n']}\")\n",
    "\n",
    "    # Do not include `temp` as it's set to 0 in `get_parameters` if `use_temp` is False\n",
    "    # Only add if temp is non-zero (if `use_temp` is True in get_parameters)\n",
    "    if config_params.get('temp') is not None and config_params['temp'] != 0:\n",
    "        command_parts.append(f\"--temp {config_params['temp']}\")\n",
    "\n",
    "    # Add fixed parameters for threads and batch sizes\n",
    "    command_parts.extend([\n",
    "        f\"--threads {threads}\",\n",
    "        f\"--batch-size {batch_size}\",\n",
    "        f\"--ubatch-size {ubatch_size}\",\n",
    "    ])\n",
    "\n",
    "    # Redirect output to file\n",
    "    command = \" \".join(command_parts) + f\" > {output_file} 2>&1\"\n",
    "    return command\n"
   ]
  },
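  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a dry run, the assembled command can be inspected without executing it; `perplexity_preview.txt` below is just a placeholder output file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dry run: print the command that would be executed for the base model.\n",
    "# 'perplexity_preview.txt' is a placeholder output file name.\n",
    "preview = build_command(base_model, 'perplexity_preview.txt', ppl_file,\n",
    "                        config_params=config_params, threads=threads,\n",
    "                        batch_size=batch_size, ubatch_size=ubatch_size)\n",
    "print(preview)"
   ]
  },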
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Measure perplexity for the base model\n",
    "if os.path.exists(f'perplexity_{base_precision}.txt'):\n",
    "        with open(base_perplexity_file, 'r') as file:\n",
    "                base_output = file.read()\n",
    "else:\n",
    "        base_command = build_command(base_model, base_perplexity_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=        ubatch_size)\n",
    "        base_output = run_command(base_command)\n",
    "base_perplexity = extract_perplexity(base_output)\n",
    "calculated_perplexity_recently = False # This will be set to True later"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quantizing model to IQ2_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ2_XS.gguf\" IQ2_XS > \"./IQ2_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ2_XS and saved as ./salamandra-2b_IQ2_XS.gguf.\n",
      "Quantizing model to IQ3_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_M.gguf\" IQ3_M > \"./IQ3_M_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_M and saved as ./salamandra-2b_IQ3_M.gguf.\n",
      "Quantizing model to IQ3_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_S.gguf\" IQ3_S > \"./IQ3_S_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_S and saved as ./salamandra-2b_IQ3_S.gguf.\n",
      "Quantizing model to IQ3_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_XS.gguf\" IQ3_XS > \"./IQ3_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_XS and saved as ./salamandra-2b_IQ3_XS.gguf.\n",
      "Quantizing model to IQ3_XXS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ3_XXS.gguf\" IQ3_XXS > \"./IQ3_XXS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ3_XXS and saved as ./salamandra-2b_IQ3_XXS.gguf.\n",
      "Quantizing model to IQ4_NL format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ4_NL.gguf\" IQ4_NL > \"./IQ4_NL_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ4_NL and saved as ./salamandra-2b_IQ4_NL.gguf.\n",
      "Quantizing model to IQ4_XS format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_IQ4_XS.gguf\" IQ4_XS > \"./IQ4_XS_log.txt\" 2>&1\n",
      "Successfully quantized model to IQ4_XS and saved as ./salamandra-2b_IQ4_XS.gguf.\n",
      "Quantizing model to Q3_K_L format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_L.gguf\" Q3_K_L > \"./Q3_K_L_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_L and saved as ./salamandra-2b_Q3_K_L.gguf.\n",
      "Quantizing model to Q3_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_M.gguf\" Q3_K_M > \"./Q3_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_M and saved as ./salamandra-2b_Q3_K_M.gguf.\n",
      "Quantizing model to Q3_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q3_K_S.gguf\" Q3_K_S > \"./Q3_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q3_K_S and saved as ./salamandra-2b_Q3_K_S.gguf.\n",
      "Quantizing model to Q4_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q4_K_M.gguf\" Q4_K_M > \"./Q4_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q4_K_M and saved as ./salamandra-2b_Q4_K_M.gguf.\n",
      "Quantizing model to Q4_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q4_K_S.gguf\" Q4_K_S > \"./Q4_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q4_K_S and saved as ./salamandra-2b_Q4_K_S.gguf.\n",
      "Quantizing model to Q5_K_M format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q5_K_M.gguf\" Q5_K_M > \"./Q5_K_M_log.txt\" 2>&1\n",
      "Successfully quantized model to Q5_K_M and saved as ./salamandra-2b_Q5_K_M.gguf.\n",
      "Quantizing model to Q5_K_S format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q5_K_S.gguf\" Q5_K_S > \"./Q5_K_S_log.txt\" 2>&1\n",
      "Successfully quantized model to Q5_K_S and saved as ./salamandra-2b_Q5_K_S.gguf.\n",
      "Quantizing model to Q6_K format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q6_K.gguf\" Q6_K > \"./Q6_K_log.txt\" 2>&1\n",
      "Successfully quantized model to Q6_K and saved as ./salamandra-2b_Q6_K.gguf.\n",
      "Quantizing model to Q8_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_Q8_0.gguf\" Q8_0 > \"./Q8_0_log.txt\" 2>&1\n",
      "Successfully quantized model to Q8_0 and saved as ./salamandra-2b_Q8_0.gguf.\n",
      "Quantizing model to TQ1_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_TQ1_0.gguf\" TQ1_0 > \"./TQ1_0_log.txt\" 2>&1\n",
      "Error during quantization to TQ1_0. Check ./TQ1_0_log.txt for details.\n",
      "Quantizing model to TQ2_0 format with command: /Users/macdev/Downloads/build/bin/llama-quantize --imatrix imatrix/oscar/imatrix.dat --leave-output-tensor salamandra-2b_bf16.gguf \"./salamandra-2b_TQ2_0.gguf\" TQ2_0 > \"./TQ2_0_log.txt\" 2>&1\n",
      "Error during quantization to TQ2_0. Check ./TQ2_0_log.txt for details.\n"
     ]
    }
   ],
   "source": [
    "# Quantize the models\n",
    "for quant in quantization_types:\n",
    "    quantize_model(\n",
    "        quantization_type=quant,\n",
    "        base_model=base_model,\n",
    "        base_model_name=base_model_name,\n",
    "        path_to_llamacpp=path_to_llamacpp,\n",
    "        imatrix_path=imatrix_path,\n",
    "        use_leave_output_tensor=use_leave_output_tensor,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ2_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ2_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ3_XXS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ3_XXS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ4_NL.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_NL.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_IQ4_XS.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_IQ4_XS.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_L.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_L.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q3_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q3_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q4_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q4_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q4_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q5_K_M.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_M.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q5_K_S.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q5_K_S.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q6_K.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q6_K.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_Q8_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_Q8_0.txt 2>&1\n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_TQ1_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_TQ1_0.txt 2>&1\n",
      "Error executing command: \n",
      "Running command: /Users/macdev/Downloads/build/bin/llama-perplexity -m salamandra-2b_TQ2_0.gguf -f ppl_test_data.txt --perplexity --ctx-size 8192 --rope-freq-base 10000.0 --top-p 0.95 --repeat-penalty 1.2 --threads 15 --batch-size 512 --ubatch-size 128 > perplexity_TQ2_0.txt 2>&1\n",
      "Error executing command: \n"
     ]
    }
   ],
   "source": [
    "# Measure perplexity for each quantized model\n",
    "perplexity_results = dict()\n",
    "perplexity_results[base_precision] = base_perplexity\n",
    "for quant in quantization_types:\n",
    "    calculated_perplexity_recently = True\n",
    "    \n",
    "    model = f\"{base_model_name}_{quant}.gguf\"\n",
    "    output_file = f\"perplexity_{quant}.txt\"\n",
    "\n",
    "    command = build_command(model, output_file, ppl_file, config_params=config_params, threads=threads, batch_size=batch_size, ubatch_size=        ubatch_size)\n",
    "    output = run_command(command)\n",
    "\n",
    "    perplexity = extract_perplexity(output)\n",
    "    perplexity_results[quant] = perplexity"
   ]
  },
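  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tables below report $\\ln(\\mathrm{PPL}(Q)/\\mathrm{PPL}(\\mathrm{fp16}))$. Since $\\ln(1+x) \\approx x$ for small $x$, this is approximately the relative perplexity increase: e.g. $\\ln(14.399/14.0431) \\approx 0.0250$, i.e. roughly a 2.5% increase for Q4_K_M."
   ]
  },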
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load previous measurements if we didnt just measure perplexity for each quantized model\n",
    "if not calculated_perplexity_recently:\n",
    "    perplexity_results = dict()\n",
    "    perplexity_results[base_precision] = base_perplexity\n",
    "\n",
    "    for quant in quantization_types:\n",
    "        output_file = f\"perplexity_{quant}.txt\"\n",
    "        try:\n",
    "            with open(output_file, 'r') as file:\n",
    "                output = file.read()\n",
    "            perplexity = extract_perplexity(output)\n",
    "        except FileNotFoundError:\n",
    "            print(f\"Output file {output_file} not found.\")\n",
    "            perplexity = None\n",
    "\n",
    "        perplexity_results[quant] = perplexity\n",
    "\n",
    "    # Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "    print(\"\\nPerplexity Comparison Table:\")\n",
    "    print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "    print(\"=\" * 55)\n",
    "    for quant, ppl in perplexity_results.items():\n",
    "        if ppl and base_perplexity:\n",
    "            ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "            print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")\n",
    "\n",
    "    print(perplexity_results)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "print(\"\\nPerplexity Comparison Table:\")\n",
    "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "print(\"=\" * 55)\n",
    "for quant, ppl in perplexity_results.items():\n",
    "    if ppl and base_perplexity:\n",
    "        ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "        print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Output file perplexity_TQ1_0.txt not found.\n",
      "Output file perplexity_TQ2_0.txt not found.\n",
      "\n",
      "Perplexity Comparison Table:\n",
      "Quantization Type    PPL(Q)     ln(PPL(Q)/PPL(fp16))     \n",
      "=======================================================\n",
      "bf16                 14.0431    0.0                      \n",
      "IQ2_XS               28.9052    0.72189                  \n",
      "IQ3_M                15.1995    0.079131                 \n",
      "IQ3_S                15.8627    0.121839                 \n",
      "IQ3_XS               16.7197    0.174456                 \n",
      "IQ3_XXS              17.6216    0.226994                 \n",
      "IQ4_NL               14.5534    0.035693                 \n",
      "IQ4_XS               14.5638    0.036408                 \n",
      "Q3_K_L               15.0444    0.068875                 \n",
      "Q3_K_M               15.2582    0.082986                 \n",
      "Q3_K_S               15.839     0.120344                 \n",
      "Q4_K_M               14.399     0.025028                 \n",
      "Q4_K_S               14.4338    0.027442                 \n",
      "Q5_K_M               14.1299    0.006162                 \n",
      "Q5_K_S               14.1497    0.007562                 \n",
      "Q6_K                 14.0675    0.001736                 \n",
      "Q8_0                 14.0495    0.000456                 \n",
      "{'bf16': 14.0431, 'IQ2_XS': 28.9052, 'IQ3_M': 15.1995, 'IQ3_S': 15.8627, 'IQ3_XS': 16.7197, 'IQ3_XXS': 17.6216, 'IQ4_NL': 14.5534, 'IQ4_XS': 14.5638, 'Q3_K_L': 15.0444, 'Q3_K_M': 15.2582, 'Q3_K_S': 15.839, 'Q4_K_M': 14.399, 'Q4_K_S': 14.4338, 'Q5_K_M': 14.1299, 'Q5_K_S': 14.1497, 'Q6_K': 14.0675, 'Q8_0': 14.0495, 'TQ1_0': None, 'TQ2_0': None}\n"
     ]
    }
   ],
   "source": [
    "perplexity_results = dict()\n",
    "perplexity_results[base_precision] = base_perplexity\n",
    "\n",
    "for quant in quantization_types:\n",
    "    output_file = f\"perplexity_{quant}.txt\"\n",
    "    try:\n",
    "        with open(output_file, 'r') as file:\n",
    "            output = file.read()\n",
    "        perplexity = extract_perplexity(output)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Output file {output_file} not found.\")\n",
    "        perplexity = None\n",
    "\n",
    "    perplexity_results[quant] = perplexity\n",
    "\n",
    "# Calculate ln(PPL(Q)/PPL(fp16)) and generate the table\n",
    "print(\"\\nPerplexity Comparison Table:\")\n",
    "print(f\"{'Quantization Type':<20} {'PPL(Q)':<10} {'ln(PPL(Q)/PPL(fp16))':<25}\")\n",
    "print(\"=\" * 55)\n",
    "for quant, ppl in perplexity_results.items():\n",
    "    if ppl and base_perplexity:\n",
    "        ln_ratio = round(math.log(ppl / base_perplexity), 6)\n",
    "        print(f\"{quant:<20} {ppl:<10} {ln_ratio:<25}\")\n",
    "\n",
    "print(perplexity_results)\n"
   ]
  }
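  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Optionally, the collected results can be persisted so the comparison table can be rebuilt later without re-running `llama-perplexity`; the file name below is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: persist the collected results. 'perplexity_results.json'\n",
    "# is an arbitrary file name, not something llama.cpp expects.\n",
    "with open('perplexity_results.json', 'w') as file:\n",
    "    json.dump(perplexity_results, file, indent=2)\n",
    "print('Saved perplexity_results.json')"
   ]
  }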
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}