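"""Gradio demo exploring the energy use and CO2 emissions measured for
Hugging Face Hub models across ten ML tasks, loaded from pre-computed
CSV files under `data/`."""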
import gradio as gr
import pandas as pd
import os
import plotly.express as px
import numpy as np

datadir = 'data/emissions/complete'
seq2seq_finetuned = ['sshleifer/distilbart-xsum-12-6', 'sshleifer/distilbart-cnn-12-6', 'sshleifer/distilbart-cnn-6-6',
                     'pszemraj/led-large-book-summary', 'google/pegasus-xsum', 'google/pegasus-large',
                     'google/pegasus-multi_news' ,'facebook/bart-large-cnn', 'ainize/bart-base-cnn']
color_discrete_map = {'Task-specific Encoder': '#636EFA', 'Multi-purpose Seq2Seq': '#AB63FA', 'Multi-purpose Decoder': '#00CC96', 'Task-specific Seq2Seq':'#EF553B'}
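# Fixed colors for the four architecture categories so they match across plots.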

def multi_check(mname):
    """Label a multi-purpose (zero-shot) model's architecture family.

    Only Flan and BLOOMZ variants appear in the zero-shot data, so any
    other model name falls through and returns None.
    """
    if 'flan' in mname:
        return 'Seq2Seq'
    elif 'bloomz' in mname:
        return 'Decoder'

def encoder_check(mname):
    """Bucket a model into one of the four architecture categories
    used to color the evaluation plots."""
    if 'flan' in mname:
        return 'Multi-purpose Seq2Seq'
    elif mname in seq2seq_finetuned:
        return 'Task-specific Seq2Seq'
    elif 'bloomz' in mname:
        return 'Multi-purpose Decoder'
    else:
        return 'Task-specific Encoder'
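
# The CSVs loaded below were produced offline. As a rough sketch only (not the
# exact measurement harness behind this study), per-query emissions for a
# Hugging Face pipeline could be collected with the `codecarbon` package as
# follows; `pipe` and `samples` are hypothetical stand-ins:
def measure_query_emissions(pipe, samples):
    """Run `pipe` over `samples` and return grams of CO2 emitted (sketch)."""
    from codecarbon import EmissionsTracker  # lazy import: optional dependency
    tracker = EmissionsTracker(log_level='error')
    tracker.start()
    for sample in samples:
        pipe(sample)
    emissions_kg = tracker.stop()  # codecarbon reports kg of CO2-equivalent
    return emissions_kg * 1000
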
# Data loading
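# Columns referenced below from these CSVs include 'model', 'task', 'dataset',
# 'num_params', 'query emissions (g)', 'query_energy (kWh)', plus per-benchmark
# metric columns such as 'imdb (acc)' or 'xsum (rouge)'.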

model_param_df = pd.read_csv('data/model_parameters.csv', header=0)
model_performance_df = pd.read_csv('data/performance.csv', header=0)
emissions_df = pd.read_csv('data/co2_data.csv',header=0)
modalities_df = pd.read_csv('data/modalities_data.csv',header=0)
finetuned_df = emissions_df[~emissions_df['task'].str.contains('zero')].copy()  # .copy() avoids SettingWithCopyWarning below
finetuned_df['task'] = finetuned_df['task'].str.replace('_', ' ')
zeroshot_df = emissions_df[emissions_df['task'].str.contains('zero')].copy()
zeroshot_df['task'] = zeroshot_df['task'].str.replace('_', ' ')
zeroshot_df['architecture_type'] = zeroshot_df.apply(lambda x: multi_check(x.model), axis=1)
grouped_df = emissions_df.groupby(['model', 'task']).mean(numeric_only=True)  # numeric_only avoids a TypeError on string columns in pandas >= 2.0
grouped_df = grouped_df.reset_index()
grouped_df = grouped_df.drop('task', axis=1)
performance_all = pd.merge(grouped_df, model_performance_df, on='model')
performance_all['type']= performance_all.apply(lambda x : encoder_check(x.model),axis=1)
performance_all['log_emissions'] = np.log1p(performance_all["query emissions (g)"])
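# Emissions span orders of magnitude, so the evaluation plots size their
# bubbles by log1p(emissions) to keep the smallest emitters visible.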
sent_df = performance_all[['imdb (acc)','sst2 (acc)','tomatoes (acc)', "query emissions (g)", 'model','type','num_params', 'log_emissions']][performance_all['task'].isin(['sentiment'])]
qa_df = performance_all[['sciq (acc)', 'squad (f1)', 'squad_v2 (f1, has answer)', "query emissions (g)", 'model','type','num_params', 'log_emissions']][performance_all['task'].isin(['qa'])]
summ_df = performance_all[['samsum (rouge)', 'xsum (rouge)', 'cnn (rouge)', "query emissions (g)", 'model','type', 'num_params','log_emissions']][performance_all['task'].isin(['summarization'])]

# Figure creation
fig0 = px.scatter(emissions_df, x="num_params", y="query emissions (g)", color="model", log_x=True, log_y=True)
fig0.update_layout(xaxis={'categoryorder':'mean ascending'})
fig0.update_layout(yaxis_title='Total carbon emitted (g)')
fig0.update_layout(xaxis_title='Number of Parameters')


fig1 = px.scatter(finetuned_df, x="task", y="query_energy (kWh)", color="model", log_y=True)
fig1.update_layout(xaxis={'categoryorder':'mean ascending'})
fig1.update_layout(yaxis_title='Total energy used (kWh)')  # matches the kWh column plotted above
fig1.update_layout(xaxis_title='Task')

fig2 = px.scatter(modalities_df, x="num_params", y="query emissions (g)", color="modality",
             log_x=True, log_y=True, custom_data=['model','task'])
fig2.update_layout(xaxis_title='Model size (number of parameters)')
fig2.update_layout(yaxis_title='Model emissions (g of CO<sub>2</sub>)')


fig3 = px.scatter(zeroshot_df, x="model", y="query emissions (g)", color="architecture_type", size='num_params', log_y=True)
fig3.update_layout(xaxis={'categoryorder':'mean ascending'})
fig3.update_layout(yaxis_title='Model emissions (g of CO<sub>2</sub>)')
fig3.update_layout(xaxis_title='Model')

fig4 = px.scatter(zeroshot_df, x="dataset", y="query emissions (g)", color="model", size='num_params', log_y=True)
fig4.update_layout(xaxis={'categoryorder':'mean ascending'})
fig4.update_layout(yaxis_title='Model emissions (g of CO<sub>2</sub>)')
fig4.update_layout(xaxis_title='Dataset')

fig5 = px.scatter(sent_df, y=['imdb (acc)', 'sst2 (acc)', 'tomatoes (acc)'], x="num_params", color="type", color_discrete_map=color_discrete_map,
                 size="log_emissions", log_x=True, hover_data=['model'])
fig5.update_layout(legend=dict(y=-0.4,x=0.3))
fig5.update_layout(yaxis_title='Text Classification Accuracy')

fig6 = px.scatter(qa_df, y=['sciq (acc)', 'squad (f1)', 'squad_v2 (f1, has answer)'], x="num_params", color="type", color_discrete_map=color_discrete_map,
                 size='log_emissions', log_x=True, hover_data=['model'])
fig6.update_layout(legend=dict(y=-0.4,x=0.3))
fig6.update_layout(yaxis_title='QA accuracy/F1')

fig7 = px.scatter(summ_df, y=['samsum (rouge)', 'xsum (rouge)', 'cnn (rouge)'], x="num_params", color="type", color_discrete_map=color_discrete_map,
                 size='log_emissions', log_x=True, hover_data=['model'])
fig7.update_layout(legend=dict(y=-0.4,x=0.3))
fig7.update_layout(yaxis_title='Summarization Rouge Score')


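# Build the Gradio UI: each section pairs a short explanation with its plot.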
demo = gr.Blocks()

with demo:
    gr.Markdown("# CO2 Inference Demo 🌎 πŸ’» ⚑")
    gr.Markdown("### TL;DR - We ran a series of experiments to measure the energy efficiency and carbon emissions of different\
    models from the HuggingFace Hub, and to see how different tasks and models compare.\
        We found that multi-purpose, generative models are orders of magnitude more energy-intensive than task-specific systems\
        for a variety of tasks, even for models with a similar number of parameters")
    gr.Markdown("### Explore the plots below to get more insights about the different models and tasks from our study.")
    with gr.Accordion("More details about our methodology:", open=False):
        gr.Markdown("We chose ten ML tasks: text classification, token classification, question answering, \
        ), masked language modeling, text generation, summarization, image classification, object detection, \
         image captioning and image generation. For each of the taks, we chose three of the most downloaded datasets and 8 of the most \
        downloaded models from the Hugging Face Hub. We ran each of the models ten times over a 1,000 sample from each of the models and measured the energy consumed and carbon emitted.")
    with gr.Row():
        with gr.Column():
            gr.Markdown("## All models from our study (carbon)")
            gr.Markdown("### Double click on the model name in the list on the right to isolate its datapoints:")
            gr.Markdown("The axes of the plot are in logarithmic scale, meaning that the difference between the least carbon-intensive and the most carbon-intensive models is over 9,000 times!")
            gr.Plot(fig0)
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Task-by-task comparison (energy)")
            gr.Markdown("### Grouping the models by task, we can see different patterns emerge:")
            gr.Markdown("Image generation is by far the most energy- and carbon-intensive task from the ones studied, and text classification \
            is the least.")
            gr.Plot(fig1)
    with gr.Row():
        with gr.Column():
            gr.Markdown("##  Modality comparison (carbon)")
            gr.Markdown("### Grouping the models by their modality shows different characteristics:")
            gr.Markdown("We can see that tasks involving images (image-to-text, image-to-category) require more energy and emit more carbon\
            than ones involving text.")
            gr.Plot(fig2)
    gr.Markdown("## Multi-task model comparison (carbon)")
    gr.Markdown("### Looking at the emissions of multi-task models, we can see that decoder-only models tend to emit more carbon compared to sequence-to-sequence ones.")
    gr.Markdown("### This pattern varies depending on the dataset and task - for summarization datasets (the 3 rightmost ones), the difference between models is less obvious.")

    with gr.Row():
        with gr.Column():
            gr.Plot(fig3)
        with gr.Column():
            gr.Plot(fig4)

    gr.Markdown("## Evaluations (accuracy vs carbon)")
    gr.Markdown("### Single-task models are, ceteris paribus, less carbon-intensive than multi-task models for all 3 tasks we looked at: ")
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Sentiment Analysis")
            gr.Plot(fig5)
        with gr.Column():
            gr.Markdown("### Question Answering")
            gr.Plot(fig6)
        with gr.Column():
            gr.Markdown("### Summarization")
            gr.Plot(fig7)


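# Start the Gradio server (Hugging Face Spaces serves the app automatically).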
demo.launch()