# Hugging Face Spaces page residue ("Spaces: Sleeping") from a web export — kept as a comment so the file parses.
# -*- coding: utf-8 -*-
"""LLM Training Cost Calculator App.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1iZpCUgC5T_ASnlDgMYm1n4RH8BZsm7sx
"""
# !pip install gradio
import gradio as gr
def estimate_training_cost(gpu_choice, precision, number_of_parameters, number_of_tokens, utilization_rate=0.5, overhead=1.10, cost_per_gpu_hour=1.85):
    """
    Estimate the training cost of a large language model.

    Uses the standard approximation of ~6 FLOPs per parameter per token for
    training, divided by the peak throughput of the selected GPU/precision.
    This assumes linear scaling of cost with parameters and tokens.

    Args:
    - gpu_choice (str): The choice of GPU, e.g. 'A100 80GB PCIe', 'V100'.
    - precision (str): The precision level for the GPU, e.g. 'bf16', 'tf32', 'tensor'.
    - number_of_parameters (int): The number of parameters in the model.
    - number_of_tokens (int): The number of tokens to train on.
    - utilization_rate (float, optional): GPU utilization (0 < rate <= 1). Default 0.5 (50%).
    - overhead (float, optional): Multiplier for overhead and additional costs
      (1 + overhead percentage). Default 1.10 (10% overhead).
    - cost_per_gpu_hour (float, optional): Cost per GPU-hour. Default $1.85/hour.

    Returns:
    - float: The estimated total cost of training the model, in dollars.

    Raises:
    - ValueError: If the GPU is unknown, the precision is not supported by the
      selected GPU, or utilization_rate is outside (0, 1].
    """
    # Peak throughput (FLOP/s) per GPU and precision combination.
    gpu_throughputs = {
        'A100 80GB PCIe': {'bf16': 312e12, 'tf32': 156e12},
        'A100 80GB SXM': {'bf16': 624e12, 'tf32': 312e12},
        'V100': {'tensor': 130e12},  # Assuming only the deep learning performance for V100
        'H100 SXM': {'bf16': 1979e12, 'tf32': 989e12},
        'H100 PCIe': {'bf16': 1513e12, 'tf32': 756e12}
    }
    # Validate inputs up front: the UI allows GPU/precision pairs this table
    # does not have (e.g. V100 + bf16), which previously surfaced as a KeyError.
    if gpu_choice not in gpu_throughputs:
        raise ValueError(f"Unknown GPU {gpu_choice!r}; choose one of {sorted(gpu_throughputs)}")
    supported = gpu_throughputs[gpu_choice]
    if precision not in supported:
        raise ValueError(f"Precision {precision!r} is not supported by {gpu_choice!r}; supported: {sorted(supported)}")
    if not 0 < utilization_rate <= 1:
        raise ValueError("utilization_rate must be in the interval (0, 1]")
    gpu_throughput = supported[precision]
    # Total training FLOPs via the ~6 * N * D approximation.
    total_flops = 6 * number_of_parameters * number_of_tokens
    # Ideal GPU-hours at peak throughput (3600 seconds per hour).
    gpu_hours = total_flops / (gpu_throughput * 3600)
    # Scale up for real-world utilization below peak.
    adjusted_gpu_hours = gpu_hours / utilization_rate
    # Account for overhead (restarts, checkpointing, evaluation, ...).
    actual_gpu_hours = adjusted_gpu_hours * overhead
    # Convert GPU-hours to dollars.
    total_cost = actual_gpu_hours * cost_per_gpu_hour
    return total_cost
def gradio_interface(gpu_choice, precision, number_of_parameters, number_of_tokens, utilization_rate, overhead, cost_per_gpu_hour):
    """Adapter between raw Gradio widget values and the cost estimator.

    Converts the textbox strings to numbers, rescales parameter/token counts
    from billions/trillions to absolute values, and formats the result.
    """
    params = float(number_of_parameters) * 1e9   # billions -> absolute count
    tokens = float(number_of_tokens) * 1e12      # trillions -> absolute count
    cost = estimate_training_cost(
        gpu_choice,
        precision,
        params,
        tokens,
        utilization_rate=float(utilization_rate),
        overhead=float(overhead),
        cost_per_gpu_hour=float(cost_per_gpu_hour),
    )
    return f"The estimated training cost is ${cost:,.2f}"
# GPU models offered in the dropdown; default_precisions is parallel to
# gpu_choices, giving the default precision for each model.
gpu_choices = [
    "A100 80GB PCIe",
    "A100 80GB SXM",
    "V100",
    "H100 SXM",
    "H100 PCIe",
]
default_precisions = ['bf16', 'tf32', 'tensor', 'bf16', 'bf16']
# Static HTML shown above the Gradio form: a centered heading plus a
# description of the supported GPUs/precisions and planned features.
title = "<h2 style='text-align: center;'>LLM Training Cost Calculator</h2>"

description = """
<p style='text-align: center;'>Estimate the cost of training large language models (LLM). This tool helps you calculate the cost based on model parameters, tokens, and GPU selections with various precision options. Select a GPU and the precision level to get an accurate cost estimate.</p>
<p><strong>Available GPUs and Precisions:</strong></p>
<ul>
<li><strong>A100 80GB PCIe:</strong> Available precisions - BFLOAT16 (bf16), Tensor Float 32 (tf32).</li>
<li><strong>A100 80GB SXM:</strong> Available precisions - BFLOAT16 (bf16), Tensor Float 32 (tf32).</li>
<li><strong>V100:</strong> Uses Deep Learning performance with Tensor Cores (tensor) as the default and only precision.</li>
<li><strong>H100 SXM:</strong> Available precisions - BFLOAT16 (bf16), Tensor Float 32 (tf32).</li>
<li><strong>H100 PCIe:</strong> Available precisions - BFLOAT16 (bf16), Tensor Float 32 (tf32).</li>
</ul>
<p>The choice of GPU and precision impacts the throughput, affecting training time and cost. BFLOAT16 is generally faster and more cost-effective, while Tensor Float 32 offers higher precision. The V100 GPU is optimized for Deep Learning with Tensor Cores.</p>
<p style='text-align: center;'>We plan to extend this calculator to include calculating the cost of fine-tuning models using strategies like LoRA or QLoRA. Stay tuned for updates where you'll be able to input the model ID from the Hugging Face Hub, select the fine-tuning strategy, and specify quantization details if QLoRA is chosen.</p>
"""
# Assemble the Gradio interface: widgets are collected in a named list
# first, then passed to gr.Interface, and the app is launched.
input_widgets = [
    gr.Dropdown(choices=gpu_choices, label="Select GPU", value='A100 80GB PCIe'),
    gr.Dropdown(choices=['bf16', 'tf32', 'tensor'], label="Select Precision", value='bf16'),
    gr.Textbox(label="Number of Parameters (in billions)", value="70"),
    gr.Textbox(label="Number of Tokens (in trillions)", value="2"),
    gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.5, label="GPU Utilization Rate"),
    gr.Slider(minimum=1.0, maximum=2.0, step=0.01, value=1.10, label="Overhead (1 + overhead percentage)"),
    gr.Textbox(label="Cost per GPU Hour ($)", value="1.85"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=input_widgets,
    outputs=[gr.Textbox(label="Estimated Training Cost")],
    title=title,
    description=description,
    article="<p style='text-align: center;'>Developed with ❤️ by Elfilali Ali</p>",
)

iface.launch()