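"""Gradio demo for the Assisted Generation Benchmark.

Reads benchmark results from data.csv and plots greedy vs. assisted generation
latency per model, dtype, and GPU.
"""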
import matplotlib

matplotlib.use('Agg')  # non-interactive backend, set before pyplot is imported

import functools

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

FIGURE_PATH = "plt.png"
FIG_DPI = 300
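
# data.csv is expected to hold one row per benchmark run, with at least the
# columns read below: "task", "gpu", "model_name", "dtype", "offload", and the
# measured latencies "Greedy" and "Assisted" (schema inferred from usage).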


def get_plot(task, gpu, omit_offload):
    """Plot greedy vs. assisted generation latency for one task/GPU slice."""
    # slice the dataframe according to the inputs
    df = pd.read_csv("data.csv")
    df = df[df["task"] == task]
    df = df[df["gpu"] == gpu]
    if omit_offload == "Yes":
        df = df[df["offload"] == 0]

    # combine model name and dtype into a single label for the x axis
    df["model and dtype"] = df["model_name"].str.cat(df[["dtype"]], sep=", ")

    # fuse the two columns to be compared (original and assisted generation)
    df = df.melt(
        id_vars=["task", "gpu", "model and dtype", "offload"],
        value_vars=["Greedy", "Assisted"],
        var_name="generation_type",
        value_name="generation_time",
    )
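    # Illustration of the wide-to-long reshape above (numbers are made up):
    #   before: model and dtype="1.3B, fp16", Greedy=30.1, Assisted=20.5
    #   after:  model and dtype="1.3B, fp16", generation_type="Greedy",   generation_time=30.1
    #           model and dtype="1.3B, fp16", generation_type="Assisted", generation_time=20.5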

    g = sns.catplot(
        data=df,
        kind="bar",
        x="model and dtype",
        y="generation_time",
        hue="generation_type",
        palette={"Greedy": "blue", "Assisted": "orange"},
        alpha=.9,
    )
    g.despine(left=True)
    g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
    g.set_xticklabels(fontsize=7)
    g.set_yticklabels(fontsize=7)
    g.legend.set_title("Generation Type")
    plt.setp(g.legend.get_texts(), fontsize='7')  # for legend text

    # Add the number to the top of each bar
    ax = g.facet_axis(0, 0)
    for i in ax.containers:
        ax.bar_label(i, fontsize=7)

    plt.savefig(FIGURE_PATH, dpi=FIG_DPI)
    plt.close(g.figure)  # free the figure so repeated calls don't accumulate memory
    return FIGURE_PATH
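
# Example of calling the plotting function directly (assumes data.csv exists):
#     get_plot("OPT: Open Text Generation", "A100 (80GB)", "No")
# writes plt.png and returns its path.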


demo = gr.Blocks()
with demo:
    gr.Markdown(
        """
        # Assisted Generation Benchmark
        """
    )

    # components shared across tabs
    omit_offload_fn = functools.partial(
        gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
    )

    def gpu_selector_fn(gpu_list):
        return gr.Dropdown(
            gpu_list, value=gpu_list[-1], label="GPU", interactive=True
        )
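
    # Note: a Gradio component instance can only be placed once in a layout,
    # which is why the factories above create a fresh Radio/Dropdown per tab
    # instead of sharing a single component.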

    with gr.Tabs():
        with gr.TabItem("OPT: Open"):
            plot_fn = functools.partial(get_plot, "OPT: Open Text Generation")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `facebook/opt-125m`

                        ### Model Names
                        - 1.3B: `facebook/opt-1.3b`
                        - 6.7B: `facebook/opt-6.7b`
                        - 30B: `facebook/opt-30b`
                        - 66B: `facebook/opt-66b`

                        ### Dataset used as input prompt
                        - C4 (en, validation set)
                        """
                    )

                # Show plot when the gradio app is initialized
                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))

            # Update plot when any of the inputs change
            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
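
            # get_plot's task argument is pre-bound via functools.partial, so
            # the remaining inputs (GPU, offload choice) line up positionally
            # with get_plot(task, gpu, omit_offload); the same wiring pattern
            # repeats in every tab below.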

        with gr.TabItem("OPT: Summarization"):
            plot_fn = functools.partial(get_plot, "OPT: Summarization")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `facebook/opt-125m`

                        ### Model Names
                        - 1.3B: `facebook/opt-1.3b`
                        - 6.7B: `facebook/opt-6.7b`
                        - 30B: `facebook/opt-30b`
                        - 66B: `facebook/opt-66b`

                        ### Dataset used as input prompt
                        - CNN Dailymail (3.0.0, validation set)
                        """
                    )

                # Show plot when the gradio app is initialized
                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))

            # Update plot when any of the inputs change
            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)

        with gr.TabItem("Whisper: ASR"):
            # NOTE: keep the task string below as-is -- it must match the
            # `task` column in data.csv, which presumably keeps the original
            # "ARS" spelling.
            plot_fn = functools.partial(get_plot, "Whisper: ARS")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `openai/whisper-tiny`

                        ### Model Names
                        - large-v2: `openai/whisper-large-v2`

                        ### Dataset used as input prompt
                        - LibriSpeech ASR (clean, validation set)
                        """
                    )

                # Show plot when the gradio app is initialized
                plot = gr.Image(value=plot_fn("T4", "No"))

            # Update plot when any of the inputs change
            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)

        with gr.TabItem("CodeGen: Code"):
            plot_fn = functools.partial(get_plot, "CodeGen: Code Generation")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `Salesforce/codegen-350M-mono`

                        ### Model Names
                        - 2B: `Salesforce/codegen-2B-mono`
                        - 6B: `Salesforce/codegen-6B-mono`
                        - 16B: `Salesforce/codegen-16B-mono`

                        ### Dataset used as input prompt
                        - The Stack (python)
                        """
                    )

                # Show plot when the gradio app is initialized
                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))

            # Update plot when any of the inputs change
            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)

        with gr.TabItem("Flan-T5: Summarization"):
            plot_fn = functools.partial(get_plot, "Flan-T5: Summarization")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `google/flan-t5-small`

                        ### Model Names
                        - large: `google/flan-t5-large`
                        - xl: `google/flan-t5-xl`
                        - xxl: `google/flan-t5-xxl`
                        - ul2: `google/flan-ul2`

                        ### Dataset used as input prompt
                        - CNN Dailymail (3.0.0, validation set)
                        """
                    )

                # Show plot when the gradio app is initialized
                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))

            # Update plot when any of the inputs change
            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)

        with gr.TabItem("Benchmark Info"):
            gr.Dataframe(
                headers=["Parameter", "Value"],
                value=[
                    ["Transformers Version", "4.29dev0"],
                    ["PyTorch Version", "2.0.0"],
                    ["OS", "Ubuntu 22.04 LTS (3090) / Debian 10 (other GPUs)"],
                    ["CUDA", "11.8 (3090) / 11.3 (other GPUs)"],
                    ["Number of input samples", "20-100 (depending on the model size)"],
                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
                ],
            )
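
# Serve the app (on a Hugging Face Space, app.py is executed directly and
# launch() starts the server).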
demo.launch()