derek-thomas (HF staff) committed
Commit 0e1f8ee
1 Parent(s): 30293f6

Adding notebooks

Files changed (2):
  1. 01-tgi-ie-benchmark.ipynb +262 -0
  2. 02-tgi-plots.ipynb +167 -0
01-tgi-ie-benchmark.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "73b1aa22-a1e3-4a1e-9dd2-042ab0f5939a",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import json\n",
+     "from getpass import getpass\n",
+     "import subprocess\n",
+     "import os\n",
+     "from datetime import datetime\n",
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami, get_inference_endpoint, get_token\n",
+     "from pathlib import Path\n",
+     "from tqdm.notebook import tqdm"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "772897cb-c2b1-4f9a-8143-ad64aed40b5b",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "notebook_login()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "8f951213-46a1-4db9-be2c-51c2291ecdc2",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "proj_dir = Path.cwd().parent\n",
+     "print(proj_dir)\n",
+     "LLMPerf_path = proj_dir/'llmperf'"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "267ea96b-b756-4e16-b41a-fee2119edf76",
+    "metadata": {
+     "tags": []
+    },
+    "source": [
+     "# Config"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "2d3341f2-217e-42a5-89fb-1653fd418c48",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "# Endpoint\n",
+     "ENDPOINT_NAME = \"gorgias-benchmark-sp\"\n",
+     "NAMESPACE = 'hf-test-lab'\n",
+     "MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'\n",
+     "INSTANCE_TYPE = 'nvidia-a100_2'\n",
+     "\n",
+     "# Simulation\n",
+     "RESULTS_DIR = proj_dir/'tgi_bench_results'/INSTANCE_TYPE\n",
+     "tgi_bss = [16, 24, 32, 40, 48, 56, 64]"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f6bbb792-b168-42b8-bff1-c6ea9f6daf79",
+    "metadata": {},
+    "source": [
+     "# Endpoint setup"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ae923833-8ca1-4d16-85be-a78ffb386c43",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "def create_endpoint(MAX_BATCH_SIZE, name, instance_type):\n",
+     "    # Reuse the endpoint if it already exists\n",
+     "    try:\n",
+     "        endpoint = get_inference_endpoint(name=name, namespace=NAMESPACE)\n",
+     "        endpoint.wait()\n",
+     "        return endpoint\n",
+     "    except Exception:\n",
+     "        pass  # endpoint does not exist yet; create it below\n",
+     "    try:\n",
+     "        endpoint = create_inference_endpoint(\n",
+     "            name,\n",
+     "            repository=MODEL,\n",
+     "            task=\"text-generation\",\n",
+     "            framework=\"pytorch\",\n",
+     "            region=\"us-east-1\",\n",
+     "            vendor=\"aws\",\n",
+     "            accelerator=\"gpu\",\n",
+     "            instance_size=\"x1\",\n",
+     "            instance_type='nvidia-a100',  # note: the instance_type argument is currently unused\n",
+     "            min_replica=0,\n",
+     "            max_replica=1,\n",
+     "            namespace=NAMESPACE,\n",
+     "            custom_image={\n",
+     "                \"health_route\": \"/health\",\n",
+     "                \"env\": {\n",
+     "                    \"MAX_INPUT_LENGTH\": \"3050\",\n",
+     "                    \"MAX_TOTAL_TOKENS\": \"3300\",\n",
+     "                    \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
+     "                    \"HF_TOKEN\": get_token(),\n",
+     "                    \"MODEL_ID\": \"/repository\",\n",
+     "                },\n",
+     "                \"url\": \"ghcr.io/huggingface/text-generation-inference:2.0.4\",\n",
+     "            },\n",
+     "            type=\"protected\",\n",
+     "        )\n",
+     "        endpoint.wait()\n",
+     "    except Exception as create_error:\n",
+     "        print(f\"Failed to create inference endpoint: {create_error}\")\n",
+     "        return None\n",
+     "\n",
+     "    return endpoint"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "491b82b3-4db8-4409-85ce-7c003a6c2f6f",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "def run_command(batch_size, endpoint, tgi_bs):\n",
+     "    prefix = f'tgibs_{tgi_bs}__bs_{batch_size}'\n",
+     "    vu = batch_size\n",
+     "\n",
+     "    # Set environment variables\n",
+     "    env = os.environ.copy()\n",
+     "    env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
+     "    env['HUGGINGFACE_API_KEY'] = get_token()\n",
+     "    # Convert pathlib.Path to string and prepend it to PYTHONPATH\n",
+     "    env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
+     "\n",
+     "    # Define the benchmark script path\n",
+     "    benchmark_script = str(LLMPerf_path / \"token_benchmark_ray.py\")\n",
+     "\n",
+     "    if not os.path.isfile(benchmark_script):\n",
+     "        print(f\"LLMPerf script not found at {benchmark_script}, please ensure the path is correct.\")\n",
+     "        return \"Script not found\", False\n",
+     "\n",
+     "    # Calculate the max number of completed requests\n",
+     "    max_requests = vu * 8\n",
+     "\n",
+     "    # Generate the results directory name\n",
+     "    date_str = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')\n",
+     "    results_dir = RESULTS_DIR / f\"{date_str}_{prefix}\"\n",
+     "\n",
+     "    # Construct the command to run the benchmark script\n",
+     "    command = [\n",
+     "        \"python\", benchmark_script,\n",
+     "        \"--model\", f\"huggingface/{MODEL}\",\n",
+     "        \"--mean-input-tokens\", \"3000\",\n",
+     "        \"--stddev-input-tokens\", \"10\",\n",
+     "        \"--mean-output-tokens\", \"240\",\n",
+     "        \"--stddev-output-tokens\", \"5\",\n",
+     "        \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
+     "        \"--timeout\", \"7200\",\n",
+     "        \"--num-concurrent-requests\", str(vu),\n",
+     "        \"--results-dir\", str(results_dir),\n",
+     "        \"--llm-api\", \"litellm\",\n",
+     "        \"--additional-sampling-params\", '{}'\n",
+     "    ]\n",
+     "\n",
+     "    # Run the command with the modified environment\n",
+     "    try:\n",
+     "        result = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env).decode('utf-8')\n",
+     "        return result, True\n",
+     "    except subprocess.CalledProcessError as e:\n",
+     "        print(f\"Error with batch size {batch_size}: {e.output.decode()}\")\n",
+     "        return e.output.decode(), False\n",
+     "\n",
+     "def find_max_working_batch_size(endpoint, tgi_bs):\n",
+     "    batch_sizes = [8, 16, 32, 64, 128, 256]\n",
+     "    max_working = None\n",
+     "    for size in tqdm(batch_sizes):\n",
+     "        tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
+     "        output, success = run_command(size, endpoint, tgi_bs)\n",
+     "        if success:\n",
+     "            max_working = size\n",
+     "        else:\n",
+     "            break\n",
+     "    if max_working is None:\n",
+     "        return \"No working batch size found in the provided list\"\n",
+     "    return max_working"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "70a11c08-0bea-43d6-85eb-ef014473c9f1",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "for tgi_bs in tqdm(tgi_bss):\n",
+     "    name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
+     "    endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE)\n",
+     "    if endpoint is None:  # creation failed; skip this TGI batch size\n",
+     "        continue\n",
+     "    tqdm.write(f\"Endpoint Created: {name}\")\n",
+     "    max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
+     "    endpoint.delete()\n",
+     "    tqdm.write(f\"Endpoint Deleted: {name}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "25ef390c-10fe-4466-b8fd-1c01730205d2",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.14"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
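
Note: the sweep above deletes each endpoint only after its benchmark pass completes, so an interrupted run can leave paused endpoints behind in the namespace. A minimal cleanup sketch, not part of this commit, assuming the f"{ENDPOINT_NAME}--tgibs-{n}" naming convention used in the loop above:

# Hypothetical cleanup helper for an interrupted sweep; the name and
# namespace values come from the Config cell of 01-tgi-ie-benchmark.ipynb.
from huggingface_hub import list_inference_endpoints

ENDPOINT_NAME = "gorgias-benchmark-sp"
NAMESPACE = "hf-test-lab"

for ep in list_inference_endpoints(namespace=NAMESPACE):
    # Only touch endpoints created by the benchmark sweep
    if ep.name.startswith(f"{ENDPOINT_NAME}--tgibs-"):
        print(f"Deleting leftover endpoint: {ep.name}")
        ep.delete()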
02-tgi-plots.ipynb ADDED
@@ -0,0 +1,167 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "61d4649c-a8ca-494d-8c11-e2aca8faea64",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "from pathlib import Path\n",
+     "import plotly.graph_objects as go\n",
+     "\n",
+     "proj_dir = Path.cwd().parent\n",
+     "proj_dir"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a59f2e07-2505-4ad3-978d-2f2a8d4c7f16",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "import json\n",
+     "import pandas as pd\n",
+     "\n",
+     "# Define the directory path where the benchmark results are located\n",
+     "dir_path = proj_dir/'tgi_bench_results'\n",
+     "\n",
+     "\n",
+     "def build_df():\n",
+     "    # Initialize an empty list to store the dataframes\n",
+     "    dfs = []\n",
+     "\n",
+     "    # Iterate through the run folders, one per (hw, tgibs, bs) combination\n",
+     "    for tgibs_folder in dir_path.glob(\"*/*_tgibs_*\"):\n",
+     "        # Each run folder contains a single *_summary.json written by the benchmark\n",
+     "        summary_file = list(tgibs_folder.glob(\"*_summary.json\"))[0]\n",
+     "        # Extract the hardware label and tgibs value from the folder name\n",
+     "        hw = tgibs_folder.parts[-2]\n",
+     "        tgibs_value = tgibs_folder.name.split('_tgibs_')[1].split('__')[0]\n",
+     "\n",
+     "        # Load the JSON file\n",
+     "        with open(summary_file, 'r') as f:\n",
+     "            data = json.load(f)\n",
+     "\n",
+     "        # Convert the JSON data to a pandas dataframe\n",
+     "        df = pd.DataFrame([data])\n",
+     "\n",
+     "        # Add columns with the tgibs value and hardware label\n",
+     "        df['tgibs'] = int(tgibs_value)\n",
+     "        df['hw'] = hw\n",
+     "        df['id'] = f\"{hw}_{tgibs_value}\"\n",
+     "\n",
+     "        # Append the dataframe to the list\n",
+     "        dfs.append(df)\n",
+     "    df = pd.concat(dfs, ignore_index=True)\n",
+     "    df = df.sort_values(by=['tgibs', 'num_concurrent_requests'], ascending=[True, True])\n",
+     "    return df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d8508fb9-fa31-4e23-80c1-e77a56d3775e",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "df = build_df()\n",
+     "\n",
+     "# Create a figure\n",
+     "fig = go.Figure()\n",
+     "\n",
+     "# Group the dataframe by id (one trace per hw/tgibs combination)\n",
+     "grouped_df = df.groupby('id')\n",
+     "\n",
+     "# List of specific ids to label\n",
+     "label_batch_sizes = ['nvidia-a100_8', 'nvidia-h100_8', 'nvidia-h100-fp8_8', 'nvidia-a100_medusa_8']\n",
+     "\n",
+     "# Iterate over each group\n",
+     "for batch_size, group in grouped_df:\n",
+     "    # Add a line to the figure\n",
+     "    fig.add_trace(go.Scatter(\n",
+     "        x=group['results_end_to_end_latency_s_mean'],\n",
+     "        y=group['results_num_completed_requests_per_min'],\n",
+     "        mode='lines+markers',\n",
+     "        name=f\"Batch Size: {batch_size}\",  # Formatting batch size in the legend\n",
+     "        hovertemplate=(\n",
+     "            f\"<b>Batch Size: {batch_size}</b><br>\"\n",
+     "            \"VU: %{text}<br>\"\n",
+     "            \"Latency: %{x:.2f}s<br>\"\n",
+     "            \"Throughput: %{y:.2f} reqs/min\"\n",
+     "        ) + \"<extra></extra>\",\n",
+     "        text=[f\"{v} VU\" for v in group['num_concurrent_requests']]  # This will only be visible on hover\n",
+     "    ))\n",
+     "\n",
+     "    # Optionally add annotations only for the first point in the specified batch sizes\n",
+     "    if batch_size in label_batch_sizes:\n",
+     "        fig.add_annotation(\n",
+     "            x=group['results_end_to_end_latency_s_mean'].iloc[0],\n",
+     "            y=group['results_num_completed_requests_per_min'].iloc[0],\n",
+     "            text=f'{batch_size[:-2].replace(\"nvidia-\", \"\")}',\n",
+     "            showarrow=False,\n",
+     "            ax=0,\n",
+     "            # ay=90,  # Offset to move the text down\n",
+     "            xanchor='center',\n",
+     "            yanchor='top'\n",
+     "        )\n",
+     "\n",
+     "# Update layout for the figure\n",
+     "fig.update_layout(\n",
+     "    title_text=\"Request Throughput vs Latency by Batch Size\",\n",
+     "    xaxis_title=\"End to End Latency (seconds)\",\n",
+     "    yaxis_title=\"Requests/min\",\n",
+     "    showlegend=True,\n",
+     ")\n",
+     "\n",
+     "# Show the figure\n",
+     "fig.show()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9d2719fe-b0b5-400f-83a0-7eaffd8f2254",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "b2472ada-8215-45cb-9efb-b094f02bb416",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.14"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
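
Note: build_df() in 02-tgi-plots.ipynb assumes the directory layout produced by 01-tgi-ie-benchmark.ipynb: one folder per instance type, one run folder per (TGI batch size, concurrency) pair, each holding the *_summary.json that LLMPerf writes. A sketch of that assumed layout, plus an optional export of the figure (the output filename is illustrative, not part of the committed notebooks):

# Assumed on-disk layout consumed by build_df():
#
#   tgi_bench_results/
#   └── nvidia-a100_2/                       # hw label (INSTANCE_TYPE)
#       └── <date>_tgibs_16__bs_8/           # f"{date_str}_{prefix}" per run
#           └── *_summary.json               # written by LLMPerf
#
# Optional: persist the interactive Plotly figure next to the results.
fig.write_html(str(proj_dir / "tgi_bench_results" / "throughput_vs_latency.html"))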