derek-thomas committed
Commit da52452
1 Parent(s): 767ef0d

Adding Notebooks

notebooks/TGI-benchmark.ipynb ADDED
@@ -0,0 +1,105 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "694df6d6-a521-4dab-977b-2828d4250781",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Text Generation Benchmarking tool\n",
+ "\n",
+ "\u001b[1m\u001b[4mUsage:\u001b[0m \u001b[1mtext-generation-benchmark\u001b[0m [OPTIONS] \u001b[1m--tokenizer-name\u001b[0m <TOKENIZER_NAME>\n",
+ "\n",
+ "\u001b[1m\u001b[4mOptions:\u001b[0m\n",
+ " \u001b[1m-t\u001b[0m, \u001b[1m--tokenizer-name\u001b[0m <TOKENIZER_NAME>\n",
+ " The name of the tokenizer (as in model_id on the huggingface hub, or local path) [env: TOKENIZER_NAME=]\n",
+ " \u001b[1m--revision\u001b[0m <REVISION>\n",
+ " The revision to use for the tokenizer if on the hub [env: REVISION=] [default: main]\n",
+ " \u001b[1m-b\u001b[0m, \u001b[1m--batch-size\u001b[0m <BATCH_SIZE>\n",
+ " The various batch sizes to benchmark for, the idea is to get enough batching to start seeing increased latency, this usually means you're moving from memory bound (usual as BS=1) to compute bound, and this is a sweet spot for the maximum batch size for the model under test\n",
+ " \u001b[1m-s\u001b[0m, \u001b[1m--sequence-length\u001b[0m <SEQUENCE_LENGTH>\n",
+ " This is the initial prompt sent to the text-generation-server length in token. Longer prompt will slow down the benchmark. Usually the latency grows somewhat linearly with this for the prefill step [env: SEQUENCE_LENGTH=] [default: 10]\n",
+ " \u001b[1m-d\u001b[0m, \u001b[1m--decode-length\u001b[0m <DECODE_LENGTH>\n",
+ " This is how many tokens will be generated by the server and averaged out to give the `decode` latency. This is the *critical* number you want to optimize for LLM spend most of their time doing decoding [env: DECODE_LENGTH=] [default: 8]\n",
+ " \u001b[1m-r\u001b[0m, \u001b[1m--runs\u001b[0m <RUNS>\n",
+ " How many runs should we average from [env: RUNS=] [default: 10]\n",
+ " \u001b[1m-w\u001b[0m, \u001b[1m--warmups\u001b[0m <WARMUPS>\n",
+ " Number of warmup cycles [env: WARMUPS=] [default: 1]\n",
+ " \u001b[1m-m\u001b[0m, \u001b[1m--master-shard-uds-path\u001b[0m <MASTER_SHARD_UDS_PATH>\n",
+ " The location of the grpc socket. This benchmark tool bypasses the router completely and directly talks to the gRPC processes [env: MASTER_SHARD_UDS_PATH=] [default: /tmp/text-generation-server-0]\n",
+ " \u001b[1m--temperature\u001b[0m <TEMPERATURE>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TEMPERATURE=]\n",
+ " \u001b[1m--top-k\u001b[0m <TOP_K>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_K=]\n",
+ " \u001b[1m--top-p\u001b[0m <TOP_P>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_P=]\n",
+ " \u001b[1m--typical-p\u001b[0m <TYPICAL_P>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TYPICAL_P=]\n",
+ " \u001b[1m--repetition-penalty\u001b[0m <REPETITION_PENALTY>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: REPETITION_PENALTY=]\n",
+ " \u001b[1m--frequency-penalty\u001b[0m <FREQUENCY_PENALTY>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: FREQUENCY_PENALTY=]\n",
+ " \u001b[1m--watermark\u001b[0m\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: WATERMARK=]\n",
+ " \u001b[1m--do-sample\u001b[0m\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: DO_SAMPLE=]\n",
+ " \u001b[1m--top-n-tokens\u001b[0m <TOP_N_TOKENS>\n",
+ " Generation parameter in case you want to specifically test/debug particular decoding strategies, for full doc refer to the `text-generation-server` [env: TOP_N_TOKENS=]\n",
+ " \u001b[1m-h\u001b[0m, \u001b[1m--help\u001b[0m\n",
+ " Print help (see more with '--help')\n",
+ " \u001b[1m-V\u001b[0m, \u001b[1m--version\u001b[0m\n",
+ " Print version\n"
+ ]
+ }
+ ],
+ "source": [
+ "!text-generation-benchmark -h"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c8afc9d5-f624-4d7f-a64f-08af02a4aaff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!text-generation-benchmark \\\n",
+ "--tokenizer-name astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \\\n",
+ "--sequence-length 3000 \\\n",
+ "--decode-length 300 \\\n",
+ "--batch-size 1 \\\n",
+ "--batch-size 2 \\\n",
+ "--batch-size 3 \\\n",
+ "--batch-size 4 \\\n",
+ "--batch-size 5"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
notebooks/TGI-launcher.ipynb ADDED
The diff for this file is too large to render. See raw diff
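
Note: the benchmark cells above assume a TGI server is already running, since `text-generation-benchmark` bypasses the router and talks directly to the gRPC socket (default `/tmp/text-generation-server-0`); launching that server is what the TGI-launcher notebook covers. As a minimal, hypothetical sketch of such a launch cell (the quantization flag and token limits here are illustrative assumptions, not values taken from these notebooks):

# Hypothetical launch of the server the benchmark connects to; flags/values are assumptions
!text-generation-launcher \
--model-id astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \
--quantize gptq \
--max-input-length 3000 \
--max-total-tokens 3300

In this sketch, `--max-input-length` matches the benchmark's `--sequence-length 3000`, and `--max-total-tokens` is at least sequence length plus `--decode-length 300`, so the benchmark requests fit within the server's limits.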