Spaces:

SadP0i
/

GGUF-Model-VRAM-Calculator

Running

File size: 22,486 Bytes

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <script>

    /**
     * Parse a string of markup with the browser's DOMParser.
     *
     * @param {string} str - Markup to parse.
     * @returns {Document} Whatever `DOMParser.parseFromString` yields for the "text/html" type.
     */
    function strToHtml(str) {
      return new DOMParser().parseFromString(str, "text/html");
    }



    // Short, jQuery-independent function to read an HTML table and write its rows into an Array.
    // Kudos to RobG at StackOverflow

    /**
     * Convert a table into an array of plain objects, one per data row.
     *
     * The first row supplies the property names; every following row becomes an
     * object keyed by those names. Cell text is trimmed. A row with fewer cells
     * than the header, or a cell without any text, yields "" for that property
     * instead of throwing.
     *
     * @param {{rows: ArrayLike<{cells: ArrayLike<{textContent?: string, innerText?: string}>}>}} table
     *   A table element (or any object exposing `rows` / `cells` the same way).
     * @returns {Object<string, string>[]} One object per row after the first; [] for a table without rows.
     */
    function tableToObj(table) {
      // textContent is preferred and innerText is the fallback; both may be
      // empty or absent, so the final "" keeps .trim() from hitting undefined.
      const cellText = (cell) => {
        if (!cell) {
          return "";
        }
        return (cell.textContent || cell.innerText || "").trim();
      };

      const rows = Array.from(table.rows);
      if (rows.length === 0) {
        return [];
      }

      // Use the first row for the property names.
      const propNames = Array.from(rows[0].cells, (cell) => cellText(cell));

      // Every remaining row is data; only as many cells as there are header
      // names are read from each row.
      return rows.slice(1).map((row) => {
        const obj = {};
        propNames.forEach((name, k) => {
          obj[name] = cellText(row.cells[k]);
        });
        return obj;
      });
    }



    /**
     * Build one display label per GPU record.
     *
     * @param {Array<Object<string, string>>} gpus - Records carrying "Product Name" and "Memory" entries.
     * @returns {string[]} Labels of the form "<Product Name> - <Memory text before its first comma>".
     */
    function formatGpu(gpus) {
      return gpus.map((gpu) => {
        const productName = gpu["Product Name"];
        const [memory] = gpu["Memory"].split(",");
        return `${productName} - ${memory}`;
      });
    }



    // Lookup table from GGUF quantization name to the bits-per-weight figure
    // that is fed into the model size calculation.
    const gguf_quants = {
      IQ1_S: 1.56,
      IQ1_M: 1.75,
      IQ2_XXS: 2.06,
      IQ2_XS: 2.31,
      IQ2_S: 2.5,
      IQ3_XXS: 3.06,
      IQ3_XS: 3.3,
      IQ3_S: 3.44,
      IQ3_M: 3.66,
      Q2_K: 3.35,
      Q3_K_S: 3.5,
      Q3_K_M: 3.91,
      Q3_K_L: 4.27,
      IQ4_XS: 4.25,
      Q4_0: 4.55,
      Q4_K_S: 4.58,
      Q4_K_M: 4.85,
      Q5_0: 5.54,
      Q5_K_S: 5.54,
      Q5_K_M: 5.69,
      Q6_K: 6.59,
      Q8_0: 8.5,
    };



    /**
     * Neutralise characters with special meaning in markup or code.
     *
     * `& < > " '` are replaced by HTML entities, `/` becomes `_`, and all
     * round, square and curly brackets are removed.
     *
     * @param {*} string - Text to clean. `null`/`undefined` are treated as an
     *   empty string and any other non-string value is converted with `String()`,
     *   so the function never throws on a missing response body.
     * @returns {string} The cleaned text.
     */
    function sanitize(string) {
      const map = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#x27;',
        "/": '_',
        '(': '',
        ')': '',
        '{': '',
        '}': '',
        '[': '',
        ']': '',
      };
      // No letters appear in the character class, so a case-insensitive flag is not needed.
      const reg = /[&<>"'/\[\]\(\)\{\}]/g;
      const text = string == null ? "" : String(string);
      return text.replace(reg, (match) => map[match]);
    }



    /**
     * Download a model's `config.json` from Hugging Face and attach a
     * `parameters` entry holding the model's size figure.
     *
     * The size is looked up in this order, moving on whenever a step fails:
     *   1. `model.safetensors.index.json` -> metadata.total_size / 2
     *   2. `pytorch_model.bin.index.json` -> metadata.total_size / 2
     *   3. the model's web page (fetched through corsproxy.io), reading the
     *      `data-props` JSON of the "ModelSafetensorsParams" div or, failing
     *      that, of the "ModelHeader" div.
     * NOTE(review): the division by 2 presumably converts a byte total into a
     * parameter count for 16-bit weights - confirm.
     *
     * @param {string} hf_model - Repository id, e.g. "org/model".
     * @returns {Promise<Object>} The parsed config plus `parameters`. When
     *   `config.json` cannot be fetched or parsed, the user is alerted and an
     *   empty object is returned.
     * @throws {Error} When none of the three size sources yields a value.
     */
    async function modelConfig(hf_model) {
      let config = {};
      let responseText;

      try {
        const response = await fetch(
          `https://huggingface.co/${hf_model}/raw/main/config.json`
        );
        responseText = await response.text();
        config = JSON.parse(responseText);
      } catch (err) {
        // responseText is still undefined when the request itself failed (for
        // example offline); show the error in that case rather than nothing.
        alert(sanitize(responseText === undefined ? String(err) : responseText));
        return config;
      }

      // Reads metadata.total_size / 2 from one of the weight index files.
      const sizeFromIndex = async (fileName, label) => {
        const response = await fetch(
          `https://huggingface.co/${hf_model}/resolve/main/${fileName}`
        );
        const index = await response.json();
        const size = index["metadata"]["total_size"] / 2;
        if (Number.isNaN(size)) {
          throw new Error(`no size in ${label} metadata`);
        }
        return size;
      };

      // Reads the size out of the JSON embedded in the model's web page.
      const sizeFromModelPage = async () => {
        const response = await fetch(
          "https://corsproxy.io/?" + encodeURIComponent(`https://huggingface.co/${hf_model}`)
        );
        const markup = await response.text();
        // DOMParser gives an inert document: the downloaded markup is only
        // queried, never inserted into (or executed in) the live page.
        const page = new DOMParser().parseFromString(markup, "text/html");

        const paramsEl = page.querySelector('div[data-target="ModelSafetensorsParams"]');
        if (paramsEl !== null) {
          return JSON.parse(paramsEl.getAttribute("data-props"))["safetensors"]["total"];
        }

        const headerEl = page.querySelector('div[data-target="ModelHeader"]');
        if (headerEl === null) {
          throw new Error(`could not find size information on the model page of ${hf_model}`);
        }
        return JSON.parse(headerEl.getAttribute("data-props"))["model"]["safetensors"]["total"];
      };

      let model_size = 0;
      try {
        model_size = await sizeFromIndex("model.safetensors.index.json", "safetensors");
      } catch (e) {
        try {
          model_size = await sizeFromIndex("pytorch_model.bin.index.json", "pytorch");
        } catch {
          model_size = await sizeFromModelPage();
        }
      }

      config.parameters = model_size;
      return config;
    }



    /**
     * Add up the element counts of the input tensors for a given context
     * length and batch size. No per-element byte width is applied.
     *
     * Tensor list taken from github:ggerganov/llama.cpp/llama.cpp:11248
     *   inp_tokens  : 1d I32, n_batch
     *   inp_embd    : 2d F32, n_embd x n_batch
     *   inp_pos     : 1d I32, n_batch
     *   inp_KQ_mask : 2d F32, n_ctx x n_batch
     *   inp_K_shift : 1d I32, n_ctx
     *   inp_sum     : 2d F32, 1 x n_batch
     * n_embd is the hidden size (github:ggerganov/llama.cpp/convert.py:248).
     *
     * @param {number} [context=8192] - Context length (n_ctx).
     * @param {Object} model_config - Model config; `hidden_size` is read.
     * @param {number} [bsz=512] - Batch size (n_batch).
     * @returns {number} Sum of the six tensor element counts.
     */
    function inputBuffer(context = 8192, model_config, bsz = 512) {
      const n_embd = model_config["hidden_size"];

      const tensors = {
        inp_tokens: bsz,
        inp_embd: n_embd * bsz,
        inp_pos: bsz,
        inp_KQ_mask: context * bsz,
        inp_K_shift: context,
        inp_sum: bsz,
      };

      return tensors.inp_tokens
        + tensors.inp_embd
        + tensors.inp_pos
        + tensors.inp_KQ_mask
        + tensors.inp_K_shift
        + tensors.inp_sum;
    }



    /**
     * Estimate the compute buffer size for a context length.
     *
     * The formula is `(context / 1024 * 2 + 0.75) * num_attention_heads * 1024 * 1024`.
     * It does not take the batch size into account; when a batch size other
     * than 512 is passed the user is told so via `alert` and the same formula
     * is used regardless.
     *
     * @param {number} [context=8192] - Context length.
     * @param {Object} model_config - Model config; `num_attention_heads` is read.
     * @param {number} [bsz=512] - Batch size (only used for the warning).
     * @returns {number} Estimated compute buffer size.
     */
    function computeBuffer(context = 8192, model_config, bsz = 512) {
      // Number() keeps numeric strings such as "512" from triggering the warning.
      if (Number(bsz) !== 512) {
        alert("batch size other than 512 is currently not supported for the compute buffer, using batch size 512 for the compute buffer calculation, the end result will be an overestimation");
      }
      return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024;
    }



    /**
     * Estimate the size of the key/value cache in bytes.
     *
     * @param {number} [context=8192] - Context length.
     * @param {Object} model_config - Model config; reads `num_attention_heads`,
     *   `num_key_value_heads`, `hidden_size` and `num_hidden_layers`.
     *   Configs of models without grouped-query attention may omit
     *   `num_key_value_heads`; it then defaults to `num_attention_heads`
     *   instead of turning the whole result into NaN.
     * @param {number} [cache_bit=16] - Bits per cached element.
     * @returns {number} Cache size in bytes.
     */
    function kvCache(context = 8192, model_config, cache_bit = 16) {
      const n_head = model_config["num_attention_heads"];
      const n_head_kv = model_config["num_key_value_heads"] == null
        ? n_head
        : model_config["num_key_value_heads"];

      const n_gqa = n_head / n_head_kv;
      const n_embd_gqa = model_config["hidden_size"] / n_gqa;
      const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context);
      // One set of elements for the keys and one for the values.
      const size = 2 * n_elements;
      return size * (cache_bit / 8);
    }



    /**
     * Combined size of everything that scales with the context: input buffer,
     * KV cache and compute buffer, rounded to two decimal places.
     *
     * @param {number} [context=8192] - Context length.
     * @param {Object} model_config - Model config handed on to the three helpers.
     * @param {number} [bsz=512] - Batch size.
     * @param {number} [cache_bit=16] - Bits per KV cache element.
     * @returns {number} Sum of the three parts, rounded to two decimals.
     */
    function contextSize(context = 8192, model_config, bsz = 512, cache_bit = 16) {
      const input_part = inputBuffer(context, model_config, bsz);
      const cache_part = kvCache(context, model_config, cache_bit);
      const compute_part = computeBuffer(context, model_config, bsz);

      const combined = input_part + cache_part + compute_part;
      return Number.parseFloat(combined.toFixed(2));
    }



    /**
     * Size of the model weights at a given number of bits per weight.
     *
     * @param {Object} model_config - Model config; `parameters` is read.
     * @param {number} [bpw=4.5] - Bits per weight.
     * @returns {number} `parameters * bpw / 8`, rounded to two decimals.
     */
    function modelSize(model_config, bpw = 4.5) {
      const parameters = model_config["parameters"];
      const size = parameters * bpw / 8;
      return Number.parseFloat(size.toFixed(2));
    }



    /**
     * Read the form, estimate the memory needs of the selected model and write
     * the results into the result fields.
     *
     * Fields read:    #modelsearch, #contextsize, #batchsize, #quantsize, #maxvram
     * Fields written: #resultmodel, #resultcontext, #resulttotal (text and
     *                 background colour), #layersize, #layersoffload
     *
     * Any error is shown to the user with `alert`; nothing is thrown.
     *
     * @param {string} format - Accepted for compatibility with existing
     *   callers but ignored: the calculation is always done for GGUF.
     * @param {string} context_loc - "vram" subtracts the context size from the
     *   available VRAM before counting offloadable layers; any other value
     *   leaves the full VRAM for layers.
     * @returns {Promise<void>}
     */
    async function calculateSizes(format, context_loc) {
      const BYTES_PER_GIB = 2 ** 30;

      try {
        const model_config = await modelConfig(document.getElementById("modelsearch").value);
        // modelConfig returns an empty object after it has already told the
        // user that config.json could not be loaded; stop instead of filling
        // the result fields with NaN.
        if (Object.keys(model_config).length === 0) {
          return;
        }

        const context = Number.parseInt(document.getElementById("contextsize").value, 10);
        const bsz = Number.parseInt(document.getElementById("batchsize").value, 10);
        const cache_bit = 16;

        const quant = document.getElementById("quantsize").innerText.trim();
        const bpw = gguf_quants[quant];
        if (bpw === undefined) {
          throw new Error(`unknown quantization "${quant}"`);
        }

        const model_size = modelSize(model_config, bpw);
        const context_size = contextSize(context, model_config, bsz, cache_bit);
        const total_size = (model_size + context_size) / BYTES_PER_GIB;

        document.getElementById("resultmodel").innerText = (model_size / BYTES_PER_GIB).toFixed(2);
        document.getElementById("resultcontext").innerText = (context_size / BYTES_PER_GIB).toFixed(2);
        const result_total_el = document.getElementById("resulttotal");
        result_total_el.innerText = total_size.toFixed(2);

        // parseFloat keeps fractional values such as 7.5 intact.
        const allocated_vram = Number.parseFloat(document.getElementById("maxvram").value);
        const headroom = allocated_vram - total_size;
        if (headroom > 0.5) {
          result_total_el.style.backgroundColor = "#bef264";
        } else if (headroom > 0) {
          result_total_el.style.backgroundColor = "#facc15";
        } else {
          result_total_el.style.backgroundColor = "#ef4444";
        }

        const total_layers = model_config["num_hidden_layers"];
        const layer_size = (model_size / BYTES_PER_GIB) / total_layers;
        document.getElementById("layersize").innerText = layer_size.toFixed(2);

        const context_dealloc = context_loc === "vram" ? (context_size / BYTES_PER_GIB) : 0;
        const layers_offload = Math.floor((allocated_vram - context_dealloc) / layer_size);
        // Clamp to the range 0..total_layers.
        const layers_shown = Math.min(total_layers, Math.max(0, layers_offload));
        document.getElementById("layersoffload").innerText = `${layers_shown}/${total_layers}`;
      } catch (e) {
        alert(e);
      }
    }

  </script>
  <link href="./styles.css" rel="stylesheet">
  <title>Can I split it? - GGUF VRAM Calculator</title>
</head>

<body class="p-8">
  <div x-data="{ format: 'gguf', context_loc: 'vram' }" class="flex flex-col max-h-screen items-center mt-16 gap-10">
    <div style="text-align: center;">
      <h1 class="text-xl font-semibold leading-6 text-gray-900">
        GGUF Model, Can I split it?
      </h1>
      <h3 class="font-semibold leading-6 text-gray-900">
        Based on <a href="https://huggingface.co/NyxKrage" style="color: blue;">NyxKrage</a>'s <a

          href="https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator" style="color: blue;">LLM VRAM
          calculator</a>
      </h3>
    </div>
    <div class="flex flex-col gap-10">
      <div class="w-auto flex flex-col gap-4">
        <div class="relative">
          <label for="maxvram"

            class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Max Allocated VRAM
          </label>
          <input value="24" type="number" name="maxvram" id="maxvram" step="1"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
        </div>

        <!-- Model Selector -->


        <div class="flex flex-row gap-4 relative">
          <label for="modelsearch"

            class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Model (unquantized)
          </label>
          <div

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"

            x-data="{

                          open: false,

                          value: 'Nexusflow/Starling-LM-7B-beta',

                          results: null,

                          toggle() {

                            if (this.open) {

                              return this.close()

                            }



                            this.$refs.input.focus()

                

                            this.open = true

                          },

                          close(focusAfter) {

                            if (! this.open) return

              

                            this.open = false

              

                            focusAfter && focusAfter.focus()

                          }

                        }" x-on:keydown.escape.prevent.stop="close($refs.input)" x-id="['model-typeahead']"

            class="relative">
            <!-- Input -->
            <input id="modelsearch" x-ref="input" x-on:click="toggle()"

              @keypress.debounce.150ms="results = (await

                    fetch('https://huggingface.co/api/quicksearch?type=model&q=' +

                    encodeURIComponent(value)).then(r => r.json())).models.filter(m => !m.id.includes('GGUF') && !m.id.includes('AWQ') && !m.id.includes('GPTQ') && !m.id.includes('exl2'));"

              :aria-expanded="open" :aria-controls="$id('model-typeahead')" x-model="value"

              class="flex justify-between items-center gap-2 w-full" />

            <!-- Panel -->
            <div x-ref="panel" x-show="open" x-transition.origin.top.left x-on:click.outside="close($refs.input)"

              :id="$id('model-typeahead')" style="display: none"

              class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10">
              <template x-for="result in results">
                <a @click="value = result.id; close($refs.input)" x-text="result.id"

                  class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"></a>
              </template>
            </div>
          </div>
        </div>


        <!-- Context Size Selector -->
        <div class="relative">
          <label for="contextsize"

            class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Context Size
          </label>
          <input value="8192" type="number" name="contextsize" id="contextsize" step="1024"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
        </div>

        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">Context
            offloaded to</label>
          <fieldset x-model="context_loc"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            <legend class="sr-only">Context location</legend>
            <div class="space-y-4 sm:flex sm:items-center sm:space-x-10 sm:space-y-0">
              <div class="flex items-center">
                <input id="context-vram" name="context-allocation" type="radio" value="vram" checked

                  class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600" />
                <label for="context-vram" class="ml-3 block text-sm font-medium leading-6 text-gray-900">VRAM</label>
              </div>
              <div class="flex items-center">
                <input id="context-ram" name="context-allocation" type="radio" value="ram"

                  class="h-4 w-4 border-gray-300 text-indigo-600 focus:ring-indigo-600" />
                <label for="context-ram" class="ml-3 block text-sm font-medium leading-6 text-gray-900">RAM</label>
              </div>
            </div>
          </fieldset>
        </div>

        <!-- GGUF Options -->
        <div x-show="format === 'gguf'" class="relative">
          <div class="flex flex-row gap-4">
            <label for="quantsize"

              class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
              Quantization Size
            </label>
            <div

              class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"

              x-data="{

                open: false,

                value: '',

                toggle() {

                  if (this.open) {

                    return this.close()

                  }



                  this.$refs.button.focus()

      

                  this.open = true

                },

                close(focusAfter) {

                  if (! this.open) return

    

                  this.open = false

    

                  focusAfter && focusAfter.focus()

                }

              }" x-on:keydown.escape.prevent.stop="close($refs.button)" x-id="['dropdown-button']" class="relative">
              <!-- Button -->
              <button x-ref="button" x-on:click="toggle()" :aria-expanded="open" :aria-controls="$id('dropdown-button')"

                type="button" id="quantsize" x-text="value.length === 0 ? 'Q4_K_S' : value"

                class="flex justify-between items-center gap-2 w-full">
                Q4_K_S

                <!-- Heroicon: chevron-down -->
                <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 text-gray-400" viewBox="0 0 20 20"

                  fill="currentColor">
                  <path fill-rule="evenodd"

                    d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"

                    clip-rule="evenodd" />
                </svg>
              </button>

              <!-- Panel -->
              <div x-data="{ quants: [

                    'IQ1_S',

                    'IQ1_M',

                    'IQ2_XXS',

                    'IQ2_XS',

                    'IQ2_S',

                    'IQ3_XXS',

                    'IQ3_XS',

                    'IQ3_S',

                    'IQ3_M',

                    'Q2_K',

                    'Q3_K_S',

                    'Q3_K_M',

                    'Q3_K_L',

                    'IQ4_XS',

                    'Q4_0',

                    'Q4_K_S',

                    'Q4_K_M',

                    'Q5_0',

                    'Q5_K_S',

                    'Q5_K_M',

                    'Q6_K',

                    'Q8_0'

                  ]}" x-ref="panel" x-show="open" x-transition.origin.top.left x-on:click.outside="close($refs.button)"

                :id="$id('dropdown-button')" style="display: none"

                class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10">
                <template x-for="quant in quants">
                  <a @click="value = quant; close($refs.button)" x-text="quant"

                    class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"></a>
                </template>
              </div>
            </div>
            <div class="relative">
              <label for="batchsize"

                class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
                Batch Size
              </label>
              <input value="512" type="number" step="128" id="batchsize"

                class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6" />
            </div>
          </div>
        </div>
        <button type="button"

          class="rounded-md bg-slate-800 px-3 py-2 text-sm font-semibold text-white shadow-sm hover:bg-slate-700 focus-visible:outline focus-visible:outline-2 focus-visible:outline-offset-2 focus-visible:outline-indigo-600"

          @click="calculateSizes(format, context_loc)">
          Submit
        </button>
      </div>
      <div class="w-auto flex flex-col gap-4">
        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Model Size (GB)
          </label>
          <div id="resultmodel"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            4.20</div>
        </div>
        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Context Size (GB)
          </label>
          <div id="resultcontext"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            6.90</div>
        </div>
        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Total Size (GB)
          </label>
          <div id="resulttotal"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            420.69</div>
        </div>
        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Layer size (GB)
          </label>
          <div id="layersize"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            42.69</div>
        </div>
        <div class="relative">
          <label class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900">
            Layers offloaded to GPU (out of total)
          </label>
          <div id="layersoffload"

            class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6">
            42</div>
        </div>
      </div>
    </div>
  </div>
  <script src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"></script>
  <script defer>

    // Run the calculation once as soon as this script executes, so the result
    // fields are filled from whatever values the form currently holds.
    calculateSizes("gguf", "vram")

  </script>
</body>

</html>