import streamlit as st
import pandas as pd
from utils import extract_from_url, get_model, calculate_memory
import plotly.express as px
import numpy as np
import gc
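# extract_from_url, get_model and calculate_memory are helpers defined in this Space's utils.py.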
st.set_page_config(page_title='Can you run it? LLM version', layout="wide", initial_sidebar_state="expanded")
st.title("Can you run it? LLM version")
percentage_width_main = 80
st.markdown(
f"""<style>
.appview-container .main .block-container{{
max-width: {percentage_width_main}%;}}
</style>
""",
unsafe_allow_html=True,
)
@st.cache_resource
def get_gpu_specs():
    return pd.read_csv("data/gpu_specs.csv")
@st.cache_resource
def get_mistralai_table():
model = get_model("mistralai/Mistral-7B-v0.1", library="transformers", access_token="")
return calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
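# Show a success/warning/error banner for each use case (Inference, Full Training Adam,
# LoRa Fine-tuning) depending on how many of the selected GPUs are required.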
def show_gpu_info(info, trainable_params=0, vendor=""):
    for var in ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']:
        _info = info.loc[var]
        if vendor != "Apple":
            if _info['Number of GPUs'] >= 3:
                func = st.error
                icon = "⛔"
            elif _info['Number of GPUs'] == 2:
                func = st.warning
                icon = "⚠️"
            else:
                func = st.success
                icon = "✅"
            msg = f"You require **{_info['Number of GPUs']}** GPUs for **{var}**"
            if var == 'LoRa Fine-tuning':
                msg += f" ({trainable_params}%)"
        else:
            if _info['Number of GPUs'] == 1:
                msg = f"You can run **{var}**"
                func = st.success
                icon = "✅"
            else:
                msg = f"You cannot run **{var}**"
                func = st.error
                icon = "⛔"
        func(msg, icon=icon)
def get_name(index):
    row = gpu_specs.iloc[index]
    return f"{row['Product Name']} ({row['RAM (GB)']} GB, {row['Year']})"
def custom_ceil(a, precision=0):
    return np.round(a + 0.5 * 10**(-precision), precision)
gpu_specs = get_gpu_specs()
_, col, _ = st.columns([1,3,1])
with col.expander("Information", expanded=True):
st.markdown("""- GPU information comes from [TechPowerUp GPU Specs](https://www.techpowerup.com/gpu-specs/)
- Mainly based on [Model Memory Calculator by hf-accelerate](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
using `transformers` library
- Inference is calculated following [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/),
where is estimated as """)
st.latex(r"""\text{Memory}_\text{Inference} \approx \text{Model Size} \times 1.2""")
st.markdown("""- For LoRa Fine-tuning, I'm asuming a **16-bit** dtype of trainable parameters. The formula (in terms of GB) is""")
st.latex(r"\text{Memory}_\text{LoRa} \approx \text{Model Size} + \left(\text{ \# trainable Params}_\text{Billions}\times\frac{16}{8} \times 4\right) \times 1.2")
access_token = st.sidebar.text_input("Access token")
model_name = st.sidebar.text_input("Model name", value="mistralai/Mistral-7B-v0.1")
if not model_name:
st.info("Please enter a model name")
st.stop()
model_name = extract_from_url(model_name)
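# Keep only the current model's memory table in session state; drop the previous one to free memory.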
if model_name not in st.session_state:
    if 'actual_model' in st.session_state:
        del st.session_state[st.session_state['actual_model']]
        del st.session_state['actual_model']
        gc.collect()
    if model_name == "mistralai/Mistral-7B-v0.1":  # cache Mistral
        st.session_state[model_name] = get_mistralai_table()
    else:
        model = get_model(model_name, library="transformers", access_token=access_token)
        st.session_state[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
        del model
        gc.collect()
    st.session_state['actual_model'] = model_name
gpu_vendor = st.sidebar.selectbox("GPU Vendor", ["NVIDIA", "AMD", "Intel", "Apple"])
# year = st.sidebar.selectbox("Filter by Release Year", list(range(2014, 2024))[::-1], index=None)
gpu_info = gpu_specs[gpu_specs['Vendor'] == gpu_vendor].sort_values('Product Name')
# if year:
# gpu_info = gpu_info[gpu_info['Year'] == year]
min_ram = gpu_info['RAM (GB)'].min()
max_ram = gpu_info['RAM (GB)'].max()
ram = st.sidebar.slider("Filter by RAM (GB)", min_ram, max_ram, (10.0, 40.0), step=0.5)
gpu_info = gpu_info[gpu_info["RAM (GB)"].between(ram[0], ram[1])]
if len(gpu_info) == 0:
st.sidebar.error(f"**{gpu_vendor}** has no GPU in that RAM range")
st.stop()
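# The GPU selectbox stores the DataFrame row index; format_func displays the product name.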
gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x : gpu_specs.iloc[x]['Product Name'])
gpu_spec = gpu_specs.iloc[gpu]
gpu_spec.name = 'INFO'
lora_pct = st.sidebar.slider("LoRa % trainable parameters", 0.1, 100.0, 2.0, step=0.1)
st.sidebar.dataframe(gpu_spec.T.astype(str))
memory_table = pd.DataFrame(st.session_state[model_name]).set_index('dtype')
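# LoRa fine-tuning memory per dtype, mirroring the formula in the Information expander:
# (model size + trainable params in billions * (16 / 8) bytes * 4) * 1.2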
memory_table['LoRA Fine-Tuning (GB)'] = (memory_table["Total Size (GB)"] +
                                         (memory_table["Parameters (Billion)"] * lora_pct / 100 * (16 / 8) * 4)) * 1.2
_memory_table = memory_table.copy()
memory_table = memory_table.round(2).T
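# Number of GPUs required = memory estimate divided by the selected card's RAM, rounded up.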
_memory_table /= gpu_spec['RAM (GB)']
_memory_table = _memory_table.apply(np.ceil).astype(int).drop(columns=['Parameters (Billion)', 'Total Size (GB)'])
_memory_table.columns = ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']
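# Reshape to long format (dtype, Variable, Number of GPUs) for the grouped bar chart in col2.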
_memory_table = _memory_table.stack().reset_index()
_memory_table.columns = ['dtype', 'Variable', 'Number of GPUs']
col1, col2 = st.columns([1,1.3])
if gpu_vendor == "Apple":
col.warning("""For M1/M2 Apple chips, PyTorch uses [Metal Performance Shaders (MPS)](https://huggingface.co/docs/accelerate/usage_guides/mps) as backend.\\
Remember that Apple M1/M2 chips share memory between CPU and GPU.""", icon="⚠️")
with col1:
st.write(f"#### [{model_name}](https://huggingface.co/{model_name}) ({custom_ceil(memory_table.iloc[3,0],1):.1f}B)")
dtypes = memory_table.columns.tolist()[::-1]
tabs = st.tabs(dtypes)
for dtype, tab in zip(dtypes, tabs):
with tab:
if dtype in ["int4", "int8"]:
_dtype = dtype.replace("int", "")
st.markdown(f"`int{_dtype}` refers to models in `GPTQ-{_dtype}bit`, `AWQ-{_dtype}bit` or `Q{_dtype}_0 GGUF/GGML`")
info = _memory_table[_memory_table['dtype'] == dtype].set_index('Variable')
show_gpu_info(info, lora_pct, gpu_vendor)
st.write(memory_table.iloc[[0, 1, 2, 4]])
with col2:
extra = ""
if gpu_vendor == "Apple":
st.warning("This graph is irrelevant for M1/M2 chips as they can't run in parallel.", icon="⚠️")
extra = "⚠️"
num_colors= 4
colors = [px.colors.sequential.RdBu[int(i*(len(px.colors.sequential.RdBu)-1)/(num_colors-1))] for i in range(num_colors)]
fig = px.bar(_memory_table, x='Variable', y='Number of GPUs', color='dtype', barmode='group', color_discrete_sequence=colors)
fig.update_layout(title=dict(text=f"{extra} Number of GPUs required for<br> {get_name(gpu)}", font=dict(size=25))
, xaxis_tickfont_size=14, yaxis_tickfont_size=16, yaxis_dtick='1')
st.plotly_chart(fig, use_container_width=True)