derek-thomas (HF staff) committed
Commit 0e1f8ee
1 Parent(s): 30293f6

Adding notebooks

Files changed (2):
  1. 01-tgi-ie-benchmark.ipynb +262 -0
  2. 02-tgi-plots.ipynb +167 -0
01-tgi-ie-benchmark.ipynb ADDED
@@ -0,0 +1,262 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "73b1aa22-a1e3-4a1e-9dd2-042ab0f5939a",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "import sys\n",
+     "import json\n",
+     "from getpass import getpass\n",
+     "import subprocess\n",
+     "import os\n",
+     "from datetime import datetime\n",
+     "import pandas as pd\n",
+     "import numpy as np\n",
+     "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami, get_inference_endpoint, get_token\n",
+     "from pathlib import Path\n",
+     "from tqdm.notebook import tqdm"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "772897cb-c2b1-4f9a-8143-ad64aed40b5b",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "notebook_login()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "8f951213-46a1-4db9-be2c-51c2291ecdc2",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "proj_dir = Path.cwd().parent\n",
+     "print(proj_dir)\n",
+     "LLMPerf_path = proj_dir/'llmperf'"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "267ea96b-b756-4e16-b41a-fee2119edf76",
+    "metadata": {
+     "tags": []
+    },
+    "source": [
+     "# Config"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "2d3341f2-217e-42a5-89fb-1653fd418c48",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "# Endpoint\n",
+     "ENDPOINT_NAME = \"gorgias-benchmark-sp\"\n",
+     "NAMESPACE = 'hf-test-lab'\n",
+     "MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'\n",
+     "INSTANCE_TYPE = 'nvidia-a100_2'\n",
+     "\n",
+     "# Simulation\n",
+     "RESULTS_DIR = proj_dir/'tgi_bench_results'/INSTANCE_TYPE\n",
+     "tgi_bss = [16, 24, 32, 40, 48, 56, 64]"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "f6bbb792-b168-42b8-bff1-c6ea9f6daf79",
+    "metadata": {},
+    "source": [
+     "# Endpoint setup"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ae923833-8ca1-4d16-85be-a78ffb386c43",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "def create_endpoint(MAX_BATCH_SIZE, name, instance_type):\n",
+     "    # Reuse the endpoint if it already exists\n",
+     "    try:\n",
+     "        endpoint = get_inference_endpoint(name=name, namespace=NAMESPACE)\n",
+     "        endpoint.wait()\n",
+     "        return endpoint\n",
+     "    except Exception:\n",
+     "        pass  # endpoint does not exist yet; create it below\n",
+     "    try:\n",
+     "        endpoint = create_inference_endpoint(\n",
+     "            name,\n",
+     "            repository=MODEL,\n",
+     "            task=\"text-generation\",\n",
+     "            framework=\"pytorch\",\n",
+     "            region=\"us-east-1\",\n",
+     "            vendor=\"aws\",\n",
+     "            accelerator=\"gpu\",\n",
+     "            instance_size=\"x1\",\n",
+     "            instance_type='nvidia-a100',  # note: the instance_type argument is currently unused\n",
+     "            min_replica=0,\n",
+     "            max_replica=1,\n",
+     "            namespace=NAMESPACE,\n",
+     "            custom_image={\n",
+     "                \"health_route\": \"/health\",\n",
+     "                \"env\": {\n",
+     "                    \"MAX_INPUT_LENGTH\": \"3050\",\n",
+     "                    \"MAX_TOTAL_TOKENS\": \"3300\",\n",
+     "                    \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
+     "                    \"HF_TOKEN\": get_token(),\n",
+     "                    \"MODEL_ID\": \"/repository\",\n",
+     "                },\n",
+     "                \"url\": \"ghcr.io/huggingface/text-generation-inference:2.0.4\",\n",
+     "            },\n",
+     "            type=\"protected\",\n",
+     "        )\n",
+     "        endpoint.wait()\n",
+     "    except Exception as create_error:\n",
+     "        print(f\"Failed to create inference endpoint: {create_error}\")\n",
+     "        return None\n",
+     "\n",
+     "    return endpoint"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "491b82b3-4db8-4409-85ce-7c003a6c2f6f",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "def run_command(batch_size, endpoint, tgi_bs):\n",
+     "    prefix = f'tgibs_{tgi_bs}__bs_{batch_size}'\n",
+     "    vu = batch_size\n",
+     "\n",
+     "    # Set environment variables\n",
+     "    env = os.environ.copy()\n",
+     "    env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
+     "    env['HUGGINGFACE_API_KEY'] = get_token()\n",
+     "    # Convert pathlib.Path to string and prepend it to PYTHONPATH\n",
+     "    env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + env.get('PYTHONPATH', ''))\n",
+     "\n",
+     "    # Define the benchmark script path\n",
+     "    benchmark_script = str(LLMPerf_path / \"token_benchmark_ray.py\")\n",
+     "\n",
+     "    if not os.path.isfile(benchmark_script):\n",
+     "        print(f\"LLMPerf script not found at {benchmark_script}, please ensure the path is correct.\")\n",
+     "        return \"Script not found\", False\n",
+     "\n",
+     "    # Calculate the max number of completed requests\n",
+     "    max_requests = vu * 8\n",
+     "\n",
+     "    # Generate the results directory name\n",
+     "    date_str = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')\n",
+     "    results_dir = RESULTS_DIR / f\"{date_str}_{prefix}\"\n",
+     "\n",
+     "    # Construct the command to run the benchmark script\n",
+     "    command = [\n",
+     "        \"python\", benchmark_script,\n",
+     "        \"--model\", f\"huggingface/{MODEL}\",\n",
+     "        \"--mean-input-tokens\", \"3000\",\n",
+     "        \"--stddev-input-tokens\", \"10\",\n",
+     "        \"--mean-output-tokens\", \"240\",\n",
+     "        \"--stddev-output-tokens\", \"5\",\n",
+     "        \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
+     "        \"--timeout\", \"7200\",\n",
+     "        \"--num-concurrent-requests\", str(vu),\n",
+     "        \"--results-dir\", str(results_dir),\n",
+     "        \"--llm-api\", \"litellm\",\n",
+     "        \"--additional-sampling-params\", '{}'\n",
+     "    ]\n",
+     "\n",
+     "    # Run the command with the modified environment\n",
+     "    try:\n",
+     "        result = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env).decode('utf-8')\n",
+     "        return result, True\n",
+     "    except subprocess.CalledProcessError as e:\n",
+     "        print(f\"Error with batch size {batch_size}: {e.output.decode()}\")\n",
+     "        return e.output.decode(), False\n",
+     "\n",
+     "def find_max_working_batch_size(endpoint, tgi_bs):\n",
+     "    batch_sizes = [8, 16, 32, 64, 128, 256]\n",
+     "    max_working = None\n",
+     "    for size in tqdm(batch_sizes):\n",
+     "        tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
+     "        output, success = run_command(size, endpoint, tgi_bs)\n",
+     "        if success:\n",
+     "            max_working = size\n",
+     "        else:\n",
+     "            break\n",
+     "    if max_working is None:\n",
+     "        return \"No working batch size found in the provided list\"\n",
+     "    return max_working"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "70a11c08-0bea-43d6-85eb-ef014473c9f1",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "for tgi_bs in tqdm(tgi_bss):\n",
+     "    name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
+     "    endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE)\n",
+     "    if endpoint is None:  # creation failed; skip this TGI batch size\n",
+     "        continue\n",
+     "    tqdm.write(f\"Endpoint Created: {name}\")\n",
+     "    max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
+     "    endpoint.delete()\n",
+     "    tqdm.write(f\"Endpoint Deleted: {name}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "25ef390c-10fe-4466-b8fd-1c01730205d2",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.14"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
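
Note: the sweep above deletes each endpoint only after its benchmark pass completes, so an interrupted run can leave paused endpoints behind in the namespace. A minimal cleanup sketch, not part of this commit, assuming the f"{ENDPOINT_NAME}--tgibs-{n}" naming convention used in the loop above:

# Hypothetical cleanup helper for an interrupted sweep; the name and
# namespace values come from the Config cell of 01-tgi-ie-benchmark.ipynb.
from huggingface_hub import list_inference_endpoints

ENDPOINT_NAME = "gorgias-benchmark-sp"
NAMESPACE = "hf-test-lab"

for ep in list_inference_endpoints(namespace=NAMESPACE):
    # Only touch endpoints created by the benchmark sweep
    if ep.name.startswith(f"{ENDPOINT_NAME}--tgibs-"):
        print(f"Deleting leftover endpoint: {ep.name}")
        ep.delete()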
02-tgi-plots.ipynb ADDED
@@ -0,0 +1,167 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "61d4649c-a8ca-494d-8c11-e2aca8faea64",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "from pathlib import Path\n",
+     "import plotly.graph_objects as go\n",
+     "\n",
+     "proj_dir = Path.cwd().parent\n",
+     "proj_dir"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a59f2e07-2505-4ad3-978d-2f2a8d4c7f16",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "import json\n",
+     "import pandas as pd\n",
+     "\n",
+     "# Define the directory path where the benchmark results are located\n",
+     "dir_path = proj_dir/'tgi_bench_results'\n",
+     "\n",
+     "\n",
+     "def build_df():\n",
+     "    # Initialize an empty list to store the dataframes\n",
+     "    dfs = []\n",
+     "\n",
+     "    # Iterate through the run folders, one per (hw, tgibs, bs) combination\n",
+     "    for tgibs_folder in dir_path.glob(\"*/*_tgibs_*\"):\n",
+     "        # Each run folder contains a single *_summary.json written by the benchmark\n",
+     "        summary_file = list(tgibs_folder.glob(\"*_summary.json\"))[0]\n",
+     "        # Extract the hardware label and tgibs value from the folder name\n",
+     "        hw = tgibs_folder.parts[-2]\n",
+     "        tgibs_value = tgibs_folder.name.split('_tgibs_')[1].split('__')[0]\n",
+     "\n",
+     "        # Load the JSON file\n",
+     "        with open(summary_file, 'r') as f:\n",
+     "            data = json.load(f)\n",
+     "\n",
+     "        # Convert the JSON data to a pandas dataframe\n",
+     "        df = pd.DataFrame([data])\n",
+     "\n",
+     "        # Add columns with the tgibs value and hardware label\n",
+     "        df['tgibs'] = int(tgibs_value)\n",
+     "        df['hw'] = hw\n",
+     "        df['id'] = f\"{hw}_{tgibs_value}\"\n",
+     "\n",
+     "        # Append the dataframe to the list\n",
+     "        dfs.append(df)\n",
+     "    df = pd.concat(dfs, ignore_index=True)\n",
+     "    df = df.sort_values(by=['tgibs', 'num_concurrent_requests'], ascending=[True, True])\n",
+     "    return df"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d8508fb9-fa31-4e23-80c1-e77a56d3775e",
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "df = build_df()\n",
+     "\n",
+     "# Create a figure\n",
+     "fig = go.Figure()\n",
+     "\n",
+     "# Group the dataframe by id (one trace per hw/tgibs combination)\n",
+     "grouped_df = df.groupby('id')\n",
+     "\n",
+     "# List of specific ids to label\n",
+     "label_batch_sizes = ['nvidia-a100_8', 'nvidia-h100_8', 'nvidia-h100-fp8_8', 'nvidia-a100_medusa_8']\n",
+     "\n",
+     "# Iterate over each group\n",
+     "for batch_size, group in grouped_df:\n",
+     "    # Add a line to the figure\n",
+     "    fig.add_trace(go.Scatter(\n",
+     "        x=group['results_end_to_end_latency_s_mean'],\n",
+     "        y=group['results_num_completed_requests_per_min'],\n",
+     "        mode='lines+markers',\n",
+     "        name=f\"Batch Size: {batch_size}\",  # Formatting batch size in the legend\n",
+     "        hovertemplate=(\n",
+     "            f\"<b>Batch Size: {batch_size}</b><br>\"\n",
+     "            \"VU: %{text}<br>\"\n",
+     "            \"Latency: %{x:.2f}s<br>\"\n",
+     "            \"Throughput: %{y:.2f} reqs/min\"\n",
+     "        ) + \"<extra></extra>\",\n",
+     "        text=[f\"{v} VU\" for v in group['num_concurrent_requests']]  # This will only be visible on hover\n",
+     "    ))\n",
+     "\n",
+     "    # Optionally add annotations only for the first point in the specified batch sizes\n",
+     "    if batch_size in label_batch_sizes:\n",
+     "        fig.add_annotation(\n",
+     "            x=group['results_end_to_end_latency_s_mean'].iloc[0],\n",
+     "            y=group['results_num_completed_requests_per_min'].iloc[0],\n",
+     "            text=f'{batch_size[:-2].replace(\"nvidia-\", \"\")}',\n",
+     "            showarrow=False,\n",
+     "            ax=0,\n",
+     "            # ay=90,  # Offset to move the text down\n",
+     "            xanchor='center',\n",
+     "            yanchor='top'\n",
+     "        )\n",
+     "\n",
+     "# Update layout for the figure\n",
+     "fig.update_layout(\n",
+     "    title_text=\"Request Throughput vs Latency by Batch Size\",\n",
+     "    xaxis_title=\"End to End Latency (seconds)\",\n",
+     "    yaxis_title=\"Requests/min\",\n",
+     "    showlegend=True,\n",
+     ")\n",
+     "\n",
+     "# Show the figure\n",
+     "fig.show()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9d2719fe-b0b5-400f-83a0-7eaffd8f2254",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "b2472ada-8215-45cb-9efb-b094f02bb416",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.14"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
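
Note: build_df() in 02-tgi-plots.ipynb assumes the directory layout produced by 01-tgi-ie-benchmark.ipynb: one folder per instance type, one run folder per (TGI batch size, concurrency) pair, each holding the *_summary.json that LLMPerf writes. A sketch of that assumed layout, plus an optional export of the figure (the output filename is illustrative, not part of the committed notebooks):

# Assumed on-disk layout consumed by build_df():
#
#   tgi_bench_results/
#   └── nvidia-a100_2/                       # hw label (INSTANCE_TYPE)
#       └── <date>_tgibs_16__bs_8/           # f"{date_str}_{prefix}" per run
#           └── *_summary.json               # written by LLMPerf
#
# Optional: persist the interactive Plotly figure next to the results.
fig.write_html(str(proj_dir / "tgi_bench_results" / "throughput_vs_latency.html"))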