import matplotlib

matplotlib.use('Agg')  # non-interactive backend: figures are rendered to files, no display needed

import functools

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Every call renders to the same file, which gr.Image then reads back.
FIGURE_PATH = "plt.png"
FIG_DPI = 300
|
|
def get_plot(task, gpu, omit_offload):
    """Build the latency bar chart for one (task, GPU) selection and save it to disk."""
    # Keep only the benchmark rows matching the current UI selection.
    df = pd.read_csv("data.csv")
    df = df[df["task"] == task]
    df = df[df["gpu"] == gpu]
    if omit_offload == "Yes":
        df = df[df["offload"] == 0]

    # Merge model name and dtype into a single categorical column for the x axis.
    df["model and dtype"] = df["model_name"].str.cat(df["dtype"], sep=", ")

    # Reshape from wide to long format -- one row per (model, generation type) pair,
    # which is the layout seaborn expects for grouped bars.
    df = df.melt(
        id_vars=["task", "gpu", "model and dtype", "offload"],
        value_vars=["Greedy", "Assisted"],
        var_name="generation_type",
        value_name="generation_time",
    )

    g = sns.catplot(
        data=df,
        kind="bar",
        x="model and dtype",
        y="generation_time",
        hue="generation_type",
        palette={"Greedy": "blue", "Assisted": "orange"},
        alpha=0.9,
    )
    g.despine(left=True)
    g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
    g.set_xticklabels(fontsize=7)
    g.set_yticklabels(fontsize=7)
    g.legend.set_title("Generation Type")
    plt.setp(g.legend.get_texts(), fontsize=7)

    # Print the latency value on top of each bar.
    ax = g.facet_axis(0, 0)
    for container in ax.containers:
        ax.bar_label(container, fontsize=7)

    plt.savefig(FIGURE_PATH, dpi=FIG_DPI)
    plt.close(g.figure)  # close the figure so repeated calls don't accumulate memory
    return FIGURE_PATH
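
# Expected data.csv schema, inferred from the columns accessed in get_plot():
#   task, gpu, model_name, dtype, offload, Greedy, Assisted
# "Greedy" and "Assisted" hold latencies in ms/token; "offload" marks runs that
# required memory offloading (0 = no offload).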
|
|
demo = gr.Blocks()

with demo:
    gr.Markdown(
        """
        # Assisted Generation Benchmark
        """
    )

    omit_offload_fn = functools.partial(
        gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
    )

    def gpu_selector_fn(gpu_list):
        return gr.Dropdown(
            gpu_list, value=gpu_list[-1], label="GPU", interactive=True
        )
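
    # Each tab below calls these factories to get its own copies of the controls,
    # since a Gradio component instance can only be rendered in one place.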
|
    with gr.Tabs():
        with gr.TabItem("OPT: Open"):
            plot_fn = functools.partial(get_plot, "OPT: Open Text Generation")
            with gr.Row():
                with gr.Column(scale=0.3):
                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
                    omit_offload = omit_offload_fn()
                    gr.Markdown(
                        """
                        ### Assistant Model
                        - `facebook/opt-125m`

                        ### Model Names:
                        - 1.3B: `facebook/opt-1.3b`
                        - 6.7B: `facebook/opt-6.7b`
                        - 30B: `facebook/opt-30b`
                        - 66B: `facebook/opt-66b`

                        ### Dataset used as input prompt:
                        - C4 (en, validation set)
                        """
                    )

                # Initial render with the same values the selectors default to.
                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))

            plot_inputs = [gpu_selector, omit_offload]
            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
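
        # The remaining task tabs repeat the same controls -> plot wiring, each
        # with its own task key, GPU list, and model details.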
|
with gr.TabItem("OPT: Summarization"): |
|
plot_fn = functools.partial(get_plot, "OPT: Summarization") |
|
with gr.Row(): |
|
with gr.Column(scale=0.3): |
|
gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"]) |
|
omit_offload = omit_offload_fn() |
|
gr.Markdown( |
|
""" |
|
### Assistant Model |
|
- `facebook/opt-125m` |
|
|
|
### Model Names: |
|
- 1.3B: `facebook/opt-1.3b` |
|
- 6.7B: `facebook/opt-6.7b` |
|
- 30B: `facebook/opt-30b` |
|
- 66B: `facebook/opt-66b` |
|
|
|
### Dataset used as input prompt: |
|
- CNN Dailymail (3.0.0, validation set) |
|
""" |
|
) |
|
|
|
plot = gr.Image(value=plot_fn("A100 (80GB)", "No")) |
|
|
|
plot_inputs = [gpu_selector, omit_offload] |
|
gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
with gr.TabItem("Whisper: ARS"): |
|
plot_fn = functools.partial(get_plot, "Whisper: ARS") |
|
with gr.Row(): |
|
with gr.Column(scale=0.3): |
|
gpu_selector = gpu_selector_fn(["3090", "T4"]) |
|
omit_offload = omit_offload_fn() |
|
gr.Markdown( |
|
""" |
|
### Assistant Model |
|
- `openai/whisper-tiny` |
|
|
|
### Model Names: |
|
- large-v2: `openai/whisper-large-v2` |
|
|
|
### Dataset used as input prompt: |
|
- Librispeech ARS (clean, validation set) |
|
""" |
|
) |
|
|
|
plot = gr.Image(value=plot_fn("T4", "No")) |
|
|
|
plot_inputs = [gpu_selector, omit_offload] |
|
gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
with gr.TabItem("CodeGen: Code"): |
|
plot_fn = functools.partial(get_plot, "CodeGen: Code Generation") |
|
with gr.Row(): |
|
with gr.Column(scale=0.3): |
|
gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"]) |
|
omit_offload = omit_offload_fn() |
|
gr.Markdown( |
|
""" |
|
### Assistant Model |
|
- `Salesforce/codegen-350M-mono` |
|
|
|
### Model Names: |
|
- 2B: `Salesforce/codegen-2B-mono` |
|
- 6B: `Salesforce/codegen-6B-mono` |
|
- 16B: `Salesforce/codegen-16B-mono` |
|
|
|
### Dataset used as input prompt: |
|
- The Stack (python) |
|
""" |
|
) |
|
|
|
plot = gr.Image(value=plot_fn("A100 (80GB)", "No")) |
|
|
|
plot_inputs = [gpu_selector, omit_offload] |
|
gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
with gr.TabItem("Flan-T5: Summarization"): |
|
plot_fn = functools.partial(get_plot, "Flan-T5: Summarization") |
|
with gr.Row(): |
|
with gr.Column(scale=0.3): |
|
gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"]) |
|
omit_offload = omit_offload_fn() |
|
gr.Markdown( |
|
""" |
|
### Assistant Model |
|
- `google/flan-t5-small` |
|
|
|
### Model Names: |
|
- large: `google/flan-t5-large` |
|
- xl: `google/flan-t5-xl` |
|
- xxl: `google/flan-t5-xxl` |
|
- ul2: `google/flan-ul2` |
|
|
|
### Dataset used as input prompt: |
|
- CNN Dailymail (3.0.0, validation set) |
|
""" |
|
) |
|
|
|
plot = gr.Image(value=plot_fn("A100 (80GB)", "No")) |
|
|
|
plot_inputs = [gpu_selector, omit_offload] |
|
gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot) |
|
with gr.TabItem("Benchmark Info"): |
|
gr.Dataframe( |
|
headers=["Parameter", "Value"], |
|
value=[ |
|
["Transformers Version", "4.29dev0"], |
|
["Pytorch Version", "2.0.0"], |
|
["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"], |
|
["CUDA", "11.8 (3090) / 11.3 (others GPUs)"], |
|
["Number of input samples", "20-100 (depending on the model size)"], |
|
["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"], |
|
], |
|
) |
|
|
demo.launch()
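
# Note: building the UI above already calls plot_fn once per tab (for the default
# gr.Image values), so five plots are rendered before the server starts.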
|
|