Corey Morris committed on
Commit
298ba1f
1 Parent(s): 0c07f8b

copied main streamlit application to one that will specifically investigate moral reasoning

Browse files
Files changed (1) hide show
  1. moral_app.py +374 -0
moral_app.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ from result_data_processor import ResultDataProcessor
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ import plotly.graph_objects as go
8
+
9
+ st.set_page_config(layout="wide")
10
+
11
def plot_top_n(df, target_column, n=10):
    """Render a grouped bar chart of the top ``n`` models for one column.

    The ``n`` rows of ``df`` with the largest ``target_column`` values are
    drawn as three side-by-side bars per model: ``target_column`` and
    ``MMLU_average`` on the primary y-axis, and ``Parameters`` on a secondary
    y-axis. The figure is sent to Streamlit and then closed.

    Args:
        df: DataFrame whose index supplies the x tick labels; must contain
            ``target_column``, ``MMLU_average`` and ``Parameters`` columns.
        target_column: Column used to rank the rows.
        n: Number of top-ranked rows to plot.
    """
    top_n = df.nlargest(n, target_column)

    # Initialize the bar plot
    fig, ax1 = plt.subplots(figsize=(10, 5))

    # Set width for each bar and their positions
    width = 0.28
    ind = np.arange(len(top_n))

    # Plot target_column and MMLU_average on the primary y-axis with adjusted positions
    ax1.bar(ind - width, top_n[target_column], width=width, color='blue', label=target_column)
    ax1.bar(ind, top_n['MMLU_average'], width=width, color='orange', label='MMLU_average')

    # Set the primary y-axis labels and title
    ax1.set_title(f'Top {n} performing models on {target_column}')
    ax1.set_xlabel('Model')
    ax1.set_ylabel('Score')

    # Create a secondary y-axis for Parameters
    ax2 = ax1.twinx()

    # Plot Parameters as bars on the secondary y-axis with adjusted position
    ax2.bar(ind + width, top_n['Parameters'], width=width, color='red', label='Parameters')

    # Set the secondary y-axis labels
    ax2.set_ylabel('Parameters', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    # Set the x-ticks and their labels
    ax1.set_xticks(ind)
    ax1.set_xticklabels(top_n.index, rotation=45, ha="right")

    # Adjust the legend
    fig.tight_layout()
    fig.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    # Show the plot
    st.pyplot(fig)

    # Figures created through pyplot stay registered until explicitly closed.
    # Close this one once it has been rendered so repeated calls do not
    # accumulate open figures.
    plt.close(fig)
50
+
51
+ # Function to create an unfilled radar chart
52
def create_radar_chart_unfilled(df, model_names, metrics):
    """Build a polar (radar) chart with one trace per model.

    Args:
        df: DataFrame indexed by model name.
        model_names: Index labels of the models to draw.
        metrics: Column names used as the angular categories.

    Returns:
        A ``go.Figure`` whose radial axis spans the smallest to the largest
        value found in the selected rows and columns.
    """
    radar = go.Figure()

    # The radial range is taken over every selected model/metric cell.
    selected_scores = df.loc[model_names, metrics]
    radial_range = [selected_scores.min().min(), selected_scores.max().max()]

    for name in model_names:
        radar.add_trace(
            go.Scatterpolar(r=df.loc[name, metrics], theta=metrics, name=name)
        )

    radar.update_layout(
        polar={'radialaxis': {'visible': True, 'range': radial_range}},
        showlegend=True,
        width=800,  # Change the width as needed
        height=600,  # Change the height as needed
    )
    return radar
75
+
76
+
77
+
78
+ # Function to create a line chart
79
def create_line_chart(df, model_names, metrics):
    """Build a line chart comparing models across metrics.

    Args:
        df: DataFrame indexed by model name.
        model_names: Index labels of the models to draw.
        metrics: Column names plotted along the x-axis.

    Returns:
        A Plotly Express line figure with one solid line per model.
    """
    # Long-form records: one row per (model, metric) pair.
    records = [
        {'Model': name, 'Metric': metric, 'Value': score}
        for name in model_names
        for metric, score in zip(metrics, df.loc[name, metrics])
    ]
    long_frame = pd.DataFrame(records)

    chart = px.line(
        long_frame,
        x='Metric',
        y='Value',
        color='Model',
        title='Comparison of Models',
        line_dash_sequence=['solid'],
    )
    chart.update_layout(showlegend=True)
    return chart
91
+
92
def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=None):
    """Find the columns where ``closest_models`` differ most from ``target_model``.

    Args:
        df: DataFrame indexed by model name with one column per task.
        target_model: Index label of the reference model.
        closest_models: Index labels of the models compared against it.
        num_differences: How many of the largest (task, model) differences
            to keep.
        exclude_columns: Columns removed before comparing. Defaults to
            ``['Parameters', 'organization']``; names that are not present
            in ``df`` are ignored.

    Returns:
        A tuple ``(table, tasks)``. ``table`` is a DataFrame with ``Task`` and
        ``Difference`` columns, largest difference first (a task appears once
        per model that made the cut). ``tasks`` lists each task once, in the
        order of its first appearance in ``table``.
    """
    if exclude_columns is None:
        exclude_columns = ['Parameters', 'organization']

    # errors='ignore' keeps this usable on frames that lack an excluded column.
    task_scores = df.drop(columns=exclude_columns, errors='ignore')

    # Calculate the absolute differences for each task between the target model and the closest models
    differences = task_scores.loc[closest_models].sub(task_scores.loc[target_model]).abs()

    # Unstack the differences and sort by the largest absolute difference
    top_differences = differences.unstack().nlargest(num_differences)

    # Convert the top differences to a DataFrame for display
    tasks = [idx[0] for idx in top_differences.index]
    top_differences_table = pd.DataFrame({
        'Task': tasks,
        'Difference': top_differences.values
    })

    # De-duplicate while preserving rank order, so callers get a stable,
    # reproducible task ordering (a set would reorder it arbitrarily).
    unique_top_differences_tasks = list(dict.fromkeys(tasks))
    return top_differences_table, unique_top_differences_tasks
106
+
107
# Create the result data provider that the rest of the script reads from.
data_provider = ResultDataProcessor()

# Page heading followed by the research questions this app investigates.
st.title('Why are large language models so bad at the moral scenarios task?')
st.markdown("""
    Here I aim to answer the question: Why are large language models so bad at the moral scenarios task?
    Sub questions:
    - Are the models actually bad at moral reasoning ?
    - Is it the structure of the task that is causing the poor performance ?
    - Are there other tasks with questions in a similar structure ?
    - How do models perform when the structure of the task is changed ?
    """)
118
+
119
filters = st.checkbox('Select Models and/or Evaluations')

# Initialize selected columns with "Parameters" and "MMLU_average" if filters are checked
# NOTE(review): selected_columns is collected here and in the multiselect
# below, but no later statement in this script reads it.
selected_columns = ['Parameters', 'MMLU_average'] if filters else data_provider.data.columns.tolist()

# Initialize selected models as empty if filters are checked
selected_models = [] if filters else data_provider.data.index.tolist()

if filters:
    # Create multi-select for columns with default selection
    selected_columns = st.multiselect(
        'Select Columns',
        data_provider.data.columns.tolist(),
        default=selected_columns
    )

    # Create multi-select for models without default selection
    selected_models = st.multiselect(
        'Select Models',
        data_provider.data.index.tolist()
    )

# Get the filtered data
filtered_data = data_provider.get_data(selected_models)

# sort the table by the MMLU_average column
filtered_data = filtered_data.sort_values(by=['MMLU_average'], ascending=False)

# Select box for filtering by Parameters
parameter_threshold = st.selectbox(
    'Filter by Parameters (Less Than or Equal To):',
    options=[3, 7, 13, 35, 'No threshold'],
    index=4,  # Set the default selected option to 'No threshold'
    format_func=lambda x: f"{x}" if isinstance(x, int) else x
)

# Filter the DataFrame based on the selected parameter threshold if not 'No threshold'
if isinstance(parameter_threshold, int):
    filtered_data = filtered_data[filtered_data['Parameters'] <= parameter_threshold]

# Search box
search_query = st.text_input("Filter by Model Name:", "")

# Filter the DataFrame based on the search query in the index (model name).
# regex=False treats the typed text as a plain substring; otherwise input
# such as "(" or "c++" is parsed as a regular expression and raises.
if search_query:
    filtered_data = filtered_data[
        filtered_data.index.str.contains(search_query, case=False, regex=False)
    ]

# Search box for columns
column_search_query = st.text_input("Filter by Column/Task Name:", "")

# Get the columns that contain the search query (case-insensitive substring)
matching_columns = [col for col in filtered_data.columns if column_search_query.lower() in col.lower()]

# Display the DataFrame with only the matching columns
st.markdown("## Sortable Results")
st.dataframe(filtered_data[matching_columns])

# CSV download: the export keeps every column, not just the matching ones.
filtered_data.index.name = "Model Name"

csv = filtered_data.to_csv(index=True)
st.download_button(
    label="Download data as CSV",
    data=csv,
    file_name="model_evaluation_results.csv",
    mime="text/csv",
)
190
+
191
+
192
def create_plot(df, x_values, y_values, models=None, title=None):
    """Build a scatter plot of two columns with an OLS trendline.

    Args:
        df: DataFrame indexed by model name.
        x_values: Name of the column plotted on the x-axis.
        y_values: Name of the column plotted on the y-axis.
        models: Optional collection of index labels; when given, only those
            rows are plotted.
        title: Optional figure title; defaults to "<x_values> vs. <y_values>".

    Returns:
        A Plotly figure. For every axis whose column name starts with
        ``MMLU`` a dashed red line is drawn at 0.25 on that axis, spanning
        the data range of the other axis.
    """
    if models is not None:
        df = df[df.index.isin(models)]

    # Rows missing either coordinate cannot be plotted.
    df = df.dropna(subset=[x_values, y_values])

    points = pd.DataFrame({
        'Model': df.index,
        x_values: df[x_values],
        y_values: df[y_values],
    })
    # A single constant value puts every point in the same color group.
    points['color'] = 'purple'

    fig = px.scatter(
        points,
        x=x_values,
        y=y_values,
        color='color',
        hover_data=['Model'],
        trendline="ols",
    )

    # Fall back to "<x> vs. <y>" when the caller gives no title.
    if title is None:
        title = x_values + " vs. " + y_values

    fig.update_layout(
        showlegend=False,
        xaxis_title=x_values,
        yaxis_title=y_values,
        xaxis=dict(),
        yaxis=dict(),
        title=title,
        height=500,
        width=1000,
    )

    # Data extents, used as the end points of the 0.25 reference lines.
    x_min, x_max = df[x_values].min(), df[x_values].max()
    y_min, y_max = df[y_values].min(), df[y_values].max()

    dashed_red = dict(color='red', width=2, dash='dash')

    if x_values.startswith('MMLU'):
        # Vertical line at x = 0.25.
        fig.add_shape(type='line', x0=0.25, x1=0.25, y0=y_min, y1=y_max, line=dict(dashed_red))

    if y_values.startswith('MMLU'):
        # Horizontal line at y = 0.25.
        fig.add_shape(type='line', x0=x_min, x1=x_max, y0=0.25, y1=0.25, line=dict(dashed_red))

    return fig
256
+
257
+
258
# Custom scatter plots
st.header('Custom scatter plots')
st.write("""
    The scatter plot is useful to identify models that outperform or underperform on a particular task in relation to their size or overall performance.
    Identifying these models is a first step to better understand what training strategies result in better performance on a particular task.
    """)
st.markdown("***The dashed red line indicates random chance accuracy of 0.25 as the MMLU evaluation is multiple choice with 4 response options.***")
# add a line separating the writing
st.markdown("***")
st.write("As expected, there is a strong positive relationship between the number of parameters and average performance on the MMLU evaluation.")

axis_options = filtered_data.columns.tolist()

# The y-axis defaults to the fourth column; clamp the index so the selectbox
# stays valid when the data has fewer than four columns.
default_y_index = max(0, min(3, len(axis_options) - 1))

selected_x_column = st.selectbox('Select x-axis', axis_options, index=0)
selected_y_column = st.selectbox('Select y-axis', axis_options, index=default_y_index)

if selected_x_column != selected_y_column:  # Avoid creating a plot with the same column on both axes
    fig = create_plot(filtered_data, selected_x_column, selected_y_column)
    st.plotly_chart(fig)
else:
    st.write("Please select different columns for the x and y axes.")
277
+
278
+
279
+
280
+
281
+ # end of custom scatter plots
282
+
283
# Section to select a model and display the comparison table and radar chart
st.header("Compare a Selected Model to the 5 Models Closest in MMLU Average Performance")
st.write("""
    This comparison highlights the nuances in model performance across different tasks.
    While the overall MMLU average score provides a general understanding of a model's capabilities,
    examining the closest models reveals variations in performance on individual tasks.
    Such an analysis can uncover specific strengths and weaknesses and guide further exploration and improvement.
    """)

default_model_name = "GPT-JT-6B-v0"

if filtered_data.empty:
    # With no rows left after filtering there is no model to select, and the
    # lookups below would fail, so explain the situation instead.
    st.warning("No models match the current filters, so there is nothing to compare.")
else:
    model_names = filtered_data.index.tolist()

    # Preselect the default model when it survived the filters, else the first row.
    default_model_index = model_names.index(default_model_name) if default_model_name in model_names else 0
    selected_model_name = st.selectbox("Select a Model:", model_names, index=default_model_index)

    # Get the 5 rows whose MMLU_average is nearest to the selected model's,
    # with unique indices. The selected model itself has distance 0.
    closest_models_diffs = filtered_data['MMLU_average'].sub(filtered_data.loc[selected_model_name, 'MMLU_average']).abs()
    closest_models = closest_models_diffs.nsmallest(5, keep='first').index.drop_duplicates().tolist()

    # Find the top 10 tasks with the largest differences and convert to a DataFrame
    top_differences_table, top_differences_tasks = find_top_differences_table(filtered_data, selected_model_name, closest_models)

    # Display the DataFrame for the closest models and the top differences tasks
    st.dataframe(filtered_data.loc[closest_models, top_differences_tasks])

    # Create a radar chart for the tasks with the largest differences
    fig_radar_top_differences = create_radar_chart_unfilled(filtered_data, closest_models, top_differences_tasks)

    # Display the radar chart
    st.plotly_chart(fig_radar_top_differences)
317
+
318
+
319
# Fixed "notable findings" section: headings, commentary and two charts for
# the abstract algebra task, followed by the moral scenarios heading.
st.markdown("## Notable findings and plots")

st.markdown('### Abstract Algebra Performance')
abstract_algebra_note = (
    "Small models showed surprisingly strong performance on the abstract algebra task. A 6 Billion parameter model is tied for the best performance on this task and there are a number of other small models in the top 10."
)
st.write(abstract_algebra_note)

# Bar chart of the ten highest-scoring rows on the task.
plot_top_n(filtered_data, 'MMLU_abstract_algebra', n=10)

# Scatter of parameter count against the task score.
fig = create_plot(df=filtered_data, x_values='Parameters', y_values='MMLU_abstract_algebra')
st.plotly_chart(fig)

# Moral scenarios plots
st.markdown("### Moral Scenarios Performance")
330
def show_random_moral_scenarios_question():
    """Show one randomly sampled moral scenarios question in an expander.

    Reads ``moral_scenarios_questions.csv`` from the working directory,
    samples a single row and writes its ``query`` value into a Streamlit
    expander.
    """
    sampled_row = pd.read_csv('moral_scenarios_questions.csv').sample()
    question_panel = st.expander("Show a random moral scenarios question")
    question_panel.write(sampled_row['query'].values[0])
335
+
336
# Moral scenarios section: a sample question, commentary, two scatter plots,
# then the acknowledgement and citation list that close the page.
show_random_moral_scenarios_question()

moral_scenarios_summary = """
    While smaller models can perform well at many tasks, the model size threshold for decent performance on moral scenarios is much higher.
    There are no models with less than 13 billion parameters with performance much better than random chance. Further investigation into other capabilities that emerge at 13 billion parameters could help
    identify capabilities that are important for moral reasoning.
    """
st.write(moral_scenarios_summary)

# Parameter count against moral scenarios accuracy.
fig = create_plot(
    filtered_data,
    'Parameters',
    'MMLU_moral_scenarios',
    title="Impact of Parameter Count on Accuracy for Moral Scenarios",
)
st.plotly_chart(fig)
st.write()

# Overall MMLU average against moral scenarios accuracy.
fig = create_plot(filtered_data, 'MMLU_average', 'MMLU_moral_scenarios')
st.plotly_chart(fig)

st.markdown("***Thank you to hugging face for running the evaluations and supplying the data as well as the original authors of the evaluations.***")

citation_markdown = """
    # Citation

    1. Corey Morris (2023). *Exploring the Characteristics of Large Language Models: An Interactive Portal for Analyzing 700+ Open Source Models Across 57 Diverse Evaluation Tasks*. [link](https://huggingface.co/spaces/CoreyMorris/MMLU-by-task-Leaderboard)

    2. Edward Beeching, Clémentine Fourrier, Nathan Habib, Sheon Han, Nathan Lambert, Nazneen Rajani, Omar Sanseviero, Lewis Tunstall, Thomas Wolf. (2023). *Open LLM Leaderboard*. Hugging Face. [link](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)

    3. Gao, Leo et al. (2021). *A framework for few-shot language model evaluation*. Zenodo. [link](https://doi.org/10.5281/zenodo.5371628)

    4. Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, Oyvind Tafjord. (2018). *Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge*. arXiv. [link](https://arxiv.org/abs/1803.05457)

    5. Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi. (2019). *HellaSwag: Can a Machine Really Finish Your Sentence?*. arXiv. [link](https://arxiv.org/abs/1905.07830)

    6. Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, Jacob Steinhardt. (2021). *Measuring Massive Multitask Language Understanding*. arXiv. [link](https://arxiv.org/abs/2009.03300)

    7. Stephanie Lin, Jacob Hilton, Owain Evans. (2022). *TruthfulQA: Measuring How Models Mimic Human Falsehoods*. arXiv. [link](https://arxiv.org/abs/2109.07958)
    """
st.markdown(citation_markdown)