justinxzhao committed
Commit 7994525
1 Parent(s): e893baa

Add a 3rd tab to explore human cross annotations.
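
For context, the new tab's heatmap boils down to a pairwise agreement computation over the cross-annotated examples. The sketch below is not part of the commit; it distills that logic using the column names app.py relies on (instruction, output_1, output_2, annotator_index, preference), and the helper name pairwise_agreement is made up for illustration.

from itertools import combinations

import pandas as pd


def pairwise_agreement(df):
    # For every (instruction, output_1, output_2) example, compare the
    # preferences of each pair of annotators who both labeled it, and return
    # the per-pair agreement rate as a Series keyed by annotator pair.
    agree, total = {}, {}
    for _, group in df.groupby(["instruction", "output_1", "output_2"]):
        labels = list(zip(group["annotator_index"], group["preference"]))
        for (a1, p1), (a2, p2) in combinations(labels, 2):
            key = tuple(sorted((a1, a2)))
            total[key] = total.get(key, 0) + 1
            agree[key] = agree.get(key, 0) + int(p1 == p2)
    return pd.Series({pair: agree[pair] / total[pair] for pair in total})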

app.py CHANGED
@@ -4,6 +4,13 @@ import plotly.express as px
 
4
  import plotly.graph_objects as go
5
  import statsmodels.api as sm
6
  import random
7
+ import pandas as pd
8
+ import numpy as np
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ import matplotlib.pyplot as plt
12
+ import seaborn as sns
13
+ from itertools import combinations
14
 
15
  # Set the layout to wide
16
  st.set_page_config(layout="wide")
@@ -21,6 +28,68 @@ h1, h2, h3, h6{
 
28
  st.markdown(center_css, unsafe_allow_html=True)
29
 
30
 
31
+ def create_agreement_heatmap(df):
32
+ # Create a list of unique annotators and sort them by annotator index
33
+ unique_annotators = sorted(df["annotator_index"].unique())
34
+
35
+ # Initialize the agreement matrix and count matrix
36
+ agreement_matrix = pd.DataFrame(
37
+ np.nan, index=unique_annotators, columns=unique_annotators
38
+ )
39
+ count_matrix = pd.DataFrame(
40
+ np.zeros((len(unique_annotators), len(unique_annotators))),
41
+ index=unique_annotators,
42
+ columns=unique_annotators,
43
+ )
44
+
45
+ # Group by (instruction, output_1, output_2)
46
+ grouped = df.groupby(["instruction", "output_1", "output_2"])
47
+
48
+ for name, group in grouped:
49
+ # Extract annotators and their preferences
50
+ annotators = group["annotator_index"].values
51
+ preferences = group["preference"].values
52
+
53
+ # Iterate over all pairs of annotators in the group
54
+ for (annotator1, pref1), (annotator2, pref2) in combinations(
55
+ zip(annotators, preferences), 2
56
+ ):
57
+ if pref1 == pref2: # If they agree
58
+ if pd.isna(agreement_matrix.loc[annotator1, annotator2]):
59
+ agreement_matrix.loc[annotator1, annotator2] = 0
60
+ if pd.isna(agreement_matrix.loc[annotator2, annotator1]):
61
+ agreement_matrix.loc[annotator2, annotator1] = 0
62
+ agreement_matrix.loc[annotator1, annotator2] += 1
63
+ agreement_matrix.loc[annotator2, annotator1] += 1
64
+ count_matrix.loc[annotator1, annotator2] += 1
65
+ count_matrix.loc[annotator2, annotator1] += 1
66
+
67
+ # Normalize the agreement matrix by the count matrix
68
+ for i in unique_annotators:
69
+ for j in unique_annotators:
70
+ if count_matrix.loc[i, j] > 0:
71
+ agreement_matrix.loc[i, j] /= count_matrix.loc[i, j]
72
+
73
+ # Plot the heatmap
74
+ plt.figure(figsize=(10, 10)) # Make the heatmap square
75
+ sns.heatmap(
76
+ agreement_matrix,
77
+ annot=True,
78
+ fmt=".2f",
79
+ cmap="PiYG",
80
+ cbar=True,
81
+ mask=np.isnan(agreement_matrix),
82
+ vmin=0.0,
83
+ vmax=1.0,
84
+ square=True,
85
+ )
86
+ plt.title("Interannotator Agreement Heatmap")
87
+ plt.xlabel("Annotator")
88
+ plt.ylabel("Annotator")
89
+ plt.tight_layout()
90
+ return agreement_matrix
91
+
92
+
93
  def prep_rankings_table(df, y_column):
94
  # Create a copy of the dataframe.
95
  df_copy = df.copy()
@@ -67,6 +136,12 @@ def get_preference_from_rounded_score(score):
 
136
  # raise ValueError(f"Invalid score: {score}")
137
 
138
 
139
+ def is_unanimous(series):
140
+ if len(set(series.tolist())) == 1:
141
+ return True
142
+ return False
143
+
144
+
145
  def app():
146
  fixed_model = "gpt4_1106_preview"
147
@@ -77,6 +152,9 @@ def app():
 
 
152
  if "selected_model" not in st.session_state:
153
  st.session_state.selected_model = "gpt4"
154
 
155
+ if "selected_output_human_annotations" not in st.session_state:
156
+ st.session_state.selected_output_human_annotations = None
157
+
158
  if "selected_judge" not in st.session_state:
159
  st.session_state.selected_judge = None
160
@@ -86,6 +164,12 @@ def app():
 
 
164
  if "instruction_options" not in st.session_state:
165
  st.session_state.instruction_options = []
166
 
167
+ if "instruction_options_human_annotations" not in st.session_state:
168
+ st.session_state.instruction_options_human_annotations = []
169
+
170
+ if "selected_instruction_human_annotations" not in st.session_state:
171
+ st.session_state.selected_instruction_human_annotations = None
172
+
173
  # Function to update the instruction options based on selected dataset
174
  def update_instruction_options():
175
  selected_dataset = st.session_state.dataset_selector
@@ -114,6 +198,32 @@ def app():
 
198
 
199
  st.session_state.instruction_options = instruction_options
200
 
201
+ def update_instruction_options_human_annotations():
202
+ selected_dataset = st.session_state.dataset_selector_human_annotations
203
+ if selected_dataset == "all" or selected_dataset == "NEW":
204
+ instruction_options = df_human_annotations["instruction"].unique().tolist()
205
+ elif (
206
+ selected_dataset == "None"
207
+ or selected_dataset is None
208
+ or str(selected_dataset) == ""
209
+ ):
210
+ instruction_options = (
211
+ df_human_annotations[pd.isna(df_human_annotations["dataset"])][
212
+ "instruction"
213
+ ]
214
+ .unique()
215
+ .tolist()
216
+ )
217
+ else:
218
+ instruction_options = (
219
+ df_human_annotations[
220
+ df_human_annotations["dataset"] == selected_dataset
221
+ ]["instruction"]
222
+ .unique()
223
+ .tolist()
224
+ )
225
+ st.session_state.instruction_options_human_annotations = instruction_options
226
+
227
  def update_instruction():
228
  st.session_state.selected_instruction = st.session_state.instruction_selector
229
@@ -133,10 +243,27 @@ def app():
 
 
243
  st.session_state.instruction_options
244
  )
245
 
246
+ def randomize_selection_human_annotations():
247
+ st.session_state.dataset_selector_human_annotations = random.choice(
248
+ ["all"] + df_human_annotations["dataset"].dropna().unique().tolist()
249
+ )
250
+ update_instruction_options_human_annotations()
251
+ st.session_state.selected_instruction_human_annotations = random.choice(
252
+ st.session_state.instruction_options_human_annotations
253
+ )
254
+ st.session_state.selected_output_human_annotations = random.choice(
255
+ df_human_annotations[
256
+ df_human_annotations["instruction"]
257
+ == st.session_state.selected_instruction_human_annotations
258
+ ]["output_2"]
259
+ .dropna()
260
+ .tolist()
261
+ )
262
+
263
  st.title("🦙 AlpacaEval Explorer 🦙")
264
 
265
  st.markdown(
- "### An interactive tool to analyze and explore the data behind the [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) in more depth"
266
+ "###### An interactive tool to analyze and explore the data behind the [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/) in more depth"
267
  )
268
 
269
  st.markdown(
@@ -149,9 +276,9 @@ def app():
 
276
  with st.expander("About AlpacaEval"):
277
  st.markdown(
278
  """- [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval) is an evaluation benchmark to assess the performance of large language models (LLMs).
279
+ - It has high correlation with [Chatbot Arena](https://chat.lmsys.org/), and is a fast and affordable benchmark for chat LLMs that uses LLMs (specifically GPT-4) to estimate response quality.
280
  - LLM responses are assessed in a pairwise fashion (arena), where each model's responses are compared to a reference model's responses.
- - The reference model is GPT-4-1106. The LLM Judge is also GPT-4-1106.
281
+ - All reference responses are generated by GPT-4-1106. The LLM Judge is also GPT-4-1106.
282
 
283
  """
284
  )
@@ -159,9 +286,10 @@ def app():
 
286
  with col2:
287
  with st.expander("About this tool"):
288
  st.markdown(
- """- There are 2 main tabs: **Data explorer** and **Length bias explorer**.
- - Use the Data explorer to look at individual pairwise battles between models.
- - Use the Length bias explorer to look at how response lengths affect win rates.
289
+ """There are 3 main tabs.
290
+ 1. Use the **Data explorer** to look at individual pairwise battles between models.
291
+ 2. Use the **Length bias explorer** to look at how response lengths affect win rates.
292
+ 3. Use the **Human cross annotations** tab to explore the human cross annotations.
293
  """
294
  )
295
@@ -169,16 +297,23 @@ def app():
 
 
297
  with st.expander("Motivation"):
298
  st.markdown(
299
  """
300
+ - Several arena-based benchmarks have demonstrated that a clear ranking among LLMs can be established, but there is a general dearth of analysis and understanding as to why the rankings are the way they are. For example, it's hard to discern how factors like feel and style
301
  are weighed against correctness.
302
  - I created this tool to provide a more interactive and intuitive way to explore the data behind the AlpacaEval leaderboard. It allows users to easily compare responses between models, look at individual battles, and analyze how response lengths affect win rates.
- - If you have any feedback on the tool, please reach out on [Twitter](https://twitter.com/justinxzhao)!
303
+ - If you have any feedback on the tool, please reach out.
304
  """
305
  )
306
 
307
- outer_tabs = st.tabs(["Data explorer", "Length bias explorer"])
+ outer_tabs = st.tabs(
308
+ [
309
+ "Data explorer",
310
+ "Length bias explorer",
311
+ "Human cross annotations",
312
+ ]
313
+ )
314
 
315
  # Load the data
316
+ df_human_annotations = pd.read_json("data/alpaca_farm_human_crossannotations.json")
317
  df = pd.read_json("data/model_win_rates.jsonl", lines=True, orient="records")
318
  # df_responses = pd.read_json("data/df_responses.jsonl", lines=True, orient="records")
319
  df_response_judging = pd.read_json(
@@ -700,6 +835,167 @@ are weighed against correctness.
 
835
  else:
836
  st.error(response_details_dynamic["output_2"])
837
 
838
+ with outer_tabs[2]:
839
+ st.markdown(
840
+ """The original [AlpacaFarm paper](https://arxiv.org/abs/2305.14387) includes a release of 20K human preferences between a given and reference model on the AlpacaFarm evaluation set. 2.5K of these are cross-annotations (4 humans annotating the same 650 examples). This tab allows you to explore the **human cross-annotations** in more detail."""
841
+ )
842
+
843
+ st.markdown("#### Choose example")
844
+ st.button(
845
+ ":game_die: Randomize!",
846
+ on_click=randomize_selection_human_annotations,
847
+ type="primary",
848
+ key="randomize_button_human_annotations",
849
+ )
850
+
851
+ left_col, right_col = st.columns([1, 3])
852
+
853
+ st.session_state.selected_dataset_human_annotations = left_col.selectbox(
854
+ "Select Dataset",
855
+ ["all"] + df_human_annotations["dataset"].dropna().unique().tolist(),
856
+ key="dataset_selector_human_annotations",
857
+ on_change=update_instruction_options_human_annotations,
858
+ )
859
+ update_instruction_options_human_annotations()
860
+ st.session_state.selected_instruction_human_annotations = right_col.selectbox(
861
+ f"Select Instruction ({len(st.session_state.instruction_options_human_annotations)} unique instructions)",
862
+ st.session_state.instruction_options_human_annotations,
863
+ key="instruction_selector_human_annotations",
864
+ on_change=update_instruction,
865
+ index=(
866
+ st.session_state.instruction_options_human_annotations.index(
867
+ st.session_state.selected_instruction_human_annotations
868
+ )
869
+ if st.session_state.selected_instruction_human_annotations
870
+ in st.session_state.instruction_options_human_annotations
871
+ else 0
872
+ ),
873
+ )
874
+
875
+ st.divider()
876
+
877
+ st.markdown(f"#### Selected instruction")
878
+ st.info(st.session_state.selected_instruction_human_annotations)
879
+
880
+ st.divider()
881
+
882
+ # Need an output column?
883
+
884
+ st.markdown("#### Responses")
885
+ col1, col2 = st.columns(2)
886
+
887
+ with col1:
888
+ st.selectbox(
889
+ "Output 1 (reference)",
890
+ df_human_annotations.loc[
891
+ df_human_annotations["instruction"]
892
+ == st.session_state.selected_instruction_human_annotations
893
+ ]["output_1"]
894
+ .unique()
895
+ .tolist(),
896
+ key="output_selector_human_annotations_fuxed",
897
+ index=0,
898
+ # label_visibility="collapsed",
899
+ )
900
+
901
+ # Get the response string for the fixed model
902
+ if st.session_state.selected_instruction_human_annotations:
903
+ response_details_fixed = df_human_annotations[
904
+ (
905
+ df_human_annotations["instruction"]
906
+ == st.session_state.selected_instruction_human_annotations
907
+ )
908
+ ].iloc[0]
909
+
910
+ st.write(
911
+ f'Number of words: {len(response_details_fixed["output_1"].split())}'
912
+ )
913
+
914
+ # Display the response string
915
+ st.info(response_details_fixed["output_1"])
916
+
917
+ with col2:
918
+ st.session_state.selected_output_human_annotations = st.selectbox(
919
+ "Output 2",
920
+ df_human_annotations.loc[
921
+ df_human_annotations["instruction"]
922
+ == st.session_state.selected_instruction_human_annotations
923
+ ]["output_2"]
924
+ .dropna()
925
+ .tolist(),
926
+ key="output_selector_human_annotations",
927
+ index=0,
928
+ # label_visibility="collapsed",
929
+ )
930
+
931
+ # Get the response string for the selected model
932
+ if (
933
+ st.session_state.selected_output_human_annotations
934
+ and st.session_state.selected_instruction_human_annotations
935
+ ):
936
+ response_details_dynamic = df_human_annotations[
937
+ (
938
+ df_human_annotations["instruction"]
939
+ == st.session_state.selected_instruction_human_annotations
940
+ )
941
+ & (
942
+ df_human_annotations["output_2"]
943
+ == st.session_state.selected_output_human_annotations
944
+ )
945
+ ].iloc[0]
946
+
947
+ st.write(
948
+ f'Number of words: {len(response_details_dynamic["output_2"].split())}'
949
+ )
950
+ st.info(response_details_dynamic["output_2"])
951
+
952
+ st.divider()
953
+
954
+ # Judging details.
955
+ st.markdown(f"#### Human Judging")
956
+
957
+ col1, col2 = st.columns(2)
958
+
959
+ with col1:
960
+
961
+ judging_details = df_human_annotations[
962
+ (df_human_annotations["output_1"] == response_details_fixed["output_1"])
963
+ & (
964
+ df_human_annotations["output_2"]
965
+ == response_details_dynamic["output_2"]
966
+ )
967
+ ]
968
+ judging_details["assigned_preference"] = judging_details[
969
+ "preference"
970
+ ].apply(get_preference_from_rounded_score)
971
+ is_unanimous_value = is_unanimous(judging_details["preference"])
972
+ st.write("**Unanimous?** ", is_unanimous_value)
973
+
974
+ # Draw a histogram of preference.
975
+ fig = px.histogram(
976
+ judging_details,
977
+ x="assigned_preference",
978
+ )
979
+ fig.update_layout(xaxis_title="Preference")
980
+ st.plotly_chart(fig)
981
+
982
+ with st.expander("Data details"):
983
+ st.dataframe(
984
+ judging_details[["annotator_index", "assigned_preference"]],
985
+ hide_index=True,
986
+ )
987
+
988
+ # Generate the heatmap figure
989
+ with col2:
990
+ agreement_matrix = create_agreement_heatmap(df_human_annotations)
991
+ # st.write(
992
+ # f"**Overall interannotator agreement:** {agreement_matrix.mean().mean():.3f}"
993
+ # )
994
+ with st.expander(
995
+ f"**Overall interannotator agreement:** {agreement_matrix.mean().mean():.3f}"
996
+ ):
997
+ st.pyplot(plt)
998
+
999
 
1000
  if __name__ == "__main__":
1001
  app()
data/alpaca_farm_human_crossannotations.json ADDED
The diff for this file is too large to render. See raw diff
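
Since the raw file is too large to render here, one way to inspect it locally is to load it the same way app.py does. This is an illustrative sketch, not part of the commit, and it assumes the columns referenced in app.py (instruction, output_1, output_2, annotator_index, preference).

import pandas as pd

# Load the newly added cross-annotation file the same way app.py does.
df_human = pd.read_json("data/alpaca_farm_human_crossannotations.json")

# Distinct annotators per (instruction, output_1, output_2) example; the tab's
# description says each cross-annotated example was labeled by 4 humans.
annotators_per_example = df_human.groupby(
    ["instruction", "output_1", "output_2"]
)["annotator_index"].nunique()
print(annotators_per_example.value_counts())

# Fraction of examples where every annotator gave the same preference.
unanimous = (
    df_human.groupby(["instruction", "output_1", "output_2"])["preference"]
    .nunique()
    .eq(1)
)
print(f"Unanimous examples: {unanimous.mean():.1%}")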