Commit da52452
Parent(s): 767ef0d

Adding Notebooks

Files changed:
- notebooks/TGI-benchmark.ipynb  +105 -0
- notebooks/TGI-launcher.ipynb   +0 -0
notebooks/TGI-benchmark.ipynb
ADDED
@@ -0,0 +1,105 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "694df6d6-a521-4dab-977b-2828d4250781",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Text Generation Benchmarking tool\n",
+      "\n",
+      "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mtext-generation-benchmark\u001b[0m [OPTIONS] \u001b[1m--tokenizer-name\u001b[0m <TOKENIZER_NAME>\n",
+      "\n",
+      "\u001b[1m\u001b[4mOptions:\u001b[0m\n",
+      " \u001b[1m-t\u001b[0m, \u001b[1m--tokenizer-name\u001b[0m <TOKENIZER_NAME>\n",
+      " The name of the tokenizer (as in model_id on the huggingface hub, or local path) [env: TOKENIZER_NAME=]\n",
+      " \u001b[1m--revision\u001b[0m <REVISION>\n",
+      " The revision to use for the tokenizer if on the hub [env: REVISION=] [default: main]\n",
+      " \u001b[1m-b\u001b[0m, \u001b[1m--batch-size\u001b[0m <BATCH_SIZE>\n",
+      " The various batch sizes to benchmark for, the idea is to get enough batching to start seeing increased latency, this usually means you're moving from memory bound (usual as BS=1) to compute bound, and this is a sweet spot for the maximum batch size for the model under test\n",
+      " \u001b[1m-s\u001b[0m, \u001b[1m--sequence-length\u001b[0m <SEQUENCE_LENGTH>\n",
+      " This is the initial prompt sent to the text-generation-server length in token. Longer prompt will slow down the benchmark. Usually the latency grows somewhat linearly with this for the prefill step [env: SEQUENCE_LENGTH=] [default: 10]\n",
+      " \u001b[1m-d\u001b[0m, \u001b[1m--decode-length\u001b[0m <DECODE_LENGTH>\n",
+      " This is how many tokens will be generated by the server and averaged out to give the `decode` latency. This is the *critical* number you want to optimize for LLM spend most of their time doing decoding [env: DECODE_LENGTH=] [default: 8]\n",
+      " \u001b[1m-r\u001b[0m, \u001b[1m--runs\u001b[0m <RUNS>\n",
+      " How many runs should we average from [env: RUNS=] [default: 10]\n",
+      " \u001b[1m-w\u001b[0m, \u001b[1m--warmups\u001b[0m <WARMUPS>\n",
+      " Number of warmup cycles [env: WARMUPS=] [default: 1]\n",
+      " \u001b[1m-m\u001b[0m, \u001b[1m--master-shard-uds-path\u001b[0m <MASTER_SHARD_UDS_PATH>\n",
+      " The location of the grpc socket. This benchmark tool bypasses the router completely and directly talks to the gRPC processes [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]\n",
+      " \u001b[1m--temperature\u001b[0m <TEMPERATURE>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TEMPERATURE=]\n",
+      " \u001b[1m--top-k\u001b[0m <TOP_K>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_K=]\n",
+      " \u001b[1m--top-p\u001b[0m <TOP_P>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_P=]\n",
+      " \u001b[1m--typical-p\u001b[0m <TYPICAL_P>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TYPICAL_P=]\n",
+      " \u001b[1m--repetition-penalty\u001b[0m <REPETITION_PENALTY>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: REPETITION_PENALTY=]\n",
+      " \u001b[1m--frequency-penalty\u001b[0m <FREQUENCY_PENALTY>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: FREQUENCY_PENALTY=]\n",
+      " \u001b[1m--watermark\u001b[0m\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: WATERMARK=]\n",
+      " \u001b[1m--do-sample\u001b[0m\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: DO_SAMPLE=]\n",
+      " \u001b[1m--top-n-tokens\u001b[0m <TOP_N_TOKENS>\n",
+      " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_N_TOKENS=]\n",
+      " \u001b[1m-h\u001b[0m, \u001b[1m--help\u001b[0m\n",
+      " Print help (see more with '--help')\n",
+      " \u001b[1m-V\u001b[0m, \u001b[1m--version\u001b[0m\n",
+      " Print version\n"
+     ]
+    }
+   ],
+   "source": [
+    "!text-generation-benchmark -h"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8afc9d5-f624-4d7f-a64f-08af02a4aaff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!text-generation-benchmark \\\n",
+    "--tokenizer-name astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \\\n",
+    "--sequence-length 3000 \\\n",
+    "--decode-length 300 \\\n",
+    "--batch-size 1 \\\n",
+    "--batch-size 2 \\\n",
+    "--batch-size 3 \\\n",
+    "--batch-size 4 \\\n",
+    "--batch-size 5"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
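Note: as the help text captured in the first cell states, text-generation-benchmark bypasses the router and talks directly to the model shard over the gRPC socket given by --master-shard-uds-path (default /tmp/text-generation-server-0), so a TGI server for the model under test must already be running in the same environment before the second cell is executed. A minimal sanity-check cell (hypothetical, not part of this commit) could be:

# Hypothetical check, not part of this commit: confirm the shard socket from the
# help text above exists before running the benchmark cell.
!ls -l /tmp/text-generation-server-0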
notebooks/TGI-launcher.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
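Since that file is not rendered here, its exact contents are unknown; judging by its name and the benchmark cell above, it presumably starts the TGI server that the benchmark talks to. A minimal, hypothetical launcher cell might look like the sketch below; the model id is taken from the benchmark notebook, and the flags shown are an assumption, not the committed content.

# Hypothetical example only -- the real TGI-launcher.ipynb is not rendered in this diff.
!text-generation-launcher \
--model-id astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \
--port 8080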