justinxzhao
committed on
Commit
•
a129336
1
Parent(s):
ae3759c
Add analysis graphs, and add color coding to interpersonal conflicts data samples.
Browse files
- app.py +80 -22
- img/council_normalized.png +0 -0
- img/judge_agreement.sidewise_cohen_kappa.png +0 -0
- img/llm_vs_llm_win_rates.png +0 -0
- img/raw.png +0 -0
app.py
CHANGED
@@ -51,14 +51,6 @@ def pil_to_base64(img):
|
|
51 |
return img_str
|
52 |
|
53 |
|
54 |
-
# Function to convert PIL image to base64
|
55 |
-
def pil_svg_to_base64(img):
|
56 |
-
buffered = BytesIO()
|
57 |
-
img.save(buffered, format="SVG")
|
58 |
-
img_str = base64.b64encode(buffered.getvalue()).decode()
|
59 |
-
return img_str
|
60 |
-
|
61 |
-
|
62 |
# Load your dataframes
|
63 |
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
|
64 |
df_responses = pd.read_json("data/responses.jsonl", lines=True)
|
@@ -82,7 +74,7 @@ model_options = df_responses["llm_responder"].unique().tolist()
|
|
82 |
# Prepare the judge selector options
|
83 |
judge_options = df_response_judging["llm_judge"].unique().tolist()
|
84 |
|
85 |
-
st.set_page_config(page_title="Language Model Council", page_icon="
|
86 |
|
87 |
# Create three columns
|
88 |
col1, col2, col3 = st.columns(3)
|
@@ -142,7 +134,7 @@ st.markdown(center_css, unsafe_allow_html=True)
|
|
142 |
# st.markdown(centered_image_html, unsafe_allow_html=True)
|
143 |
|
144 |
# Title and subtitle.
|
145 |
-
st.title("Language Model Council")
|
146 |
st.markdown(
|
147 |
"###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
|
148 |
)
|
@@ -179,11 +171,19 @@ st.markdown(
|
|
179 |
)
|
180 |
|
181 |
# Create horizontal tabs
|
182 |
-
tabs = st.tabs(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
|
184 |
# Define content for each tab
|
185 |
with tabs[0]:
|
186 |
-
st.
|
|
|
187 |
|
188 |
|
189 |
# HTML and CSS to create a text box with specified color
|
@@ -193,7 +193,7 @@ def colored_text_box(text, background_color, text_color="black"):
|
|
193 |
background-color: {background_color};
|
194 |
color: {text_color};
|
195 |
padding: 10px;
|
196 |
-
border-radius:
|
197 |
">
|
198 |
{text}
|
199 |
</div>
|
@@ -263,15 +263,21 @@ with tabs[1]:
|
|
263 |
# Display the detailed dilemma and additional information
|
264 |
st.markdown(
|
265 |
colored_text_box(
|
266 |
-
scenario_details["detailed_dilemma"],
|
|
|
|
|
267 |
),
|
268 |
unsafe_allow_html=True,
|
269 |
)
|
270 |
with st.expander("Additional Information"):
|
271 |
-
st.write(
|
272 |
-
|
273 |
-
|
274 |
-
|
|
|
|
|
|
|
|
|
275 |
|
276 |
st.divider()
|
277 |
|
@@ -296,7 +302,9 @@ with tabs[1]:
|
|
296 |
# Display the response string
|
297 |
st.markdown(
|
298 |
colored_text_box(
|
299 |
-
response_details_fixed["response_string"],
|
|
|
|
|
300 |
),
|
301 |
unsafe_allow_html=True,
|
302 |
)
|
@@ -324,7 +332,9 @@ with tabs[1]:
|
|
324 |
# Display the response string
|
325 |
st.markdown(
|
326 |
colored_text_box(
|
327 |
-
response_details_dynamic["response_string"],
|
|
|
|
|
328 |
),
|
329 |
unsafe_allow_html=True,
|
330 |
)
|
@@ -414,7 +424,7 @@ with tabs[1]:
|
|
414 |
st.markdown(
|
415 |
colored_text_box(
|
416 |
judging_details_left["judging_response_string"],
|
417 |
-
"#
|
418 |
"black",
|
419 |
),
|
420 |
unsafe_allow_html=True,
|
@@ -430,7 +440,7 @@ with tabs[1]:
|
|
430 |
st.markdown(
|
431 |
colored_text_box(
|
432 |
judging_details_right["judging_response_string"],
|
433 |
-
"#
|
434 |
"black",
|
435 |
),
|
436 |
unsafe_allow_html=True,
|
@@ -439,6 +449,54 @@ with tabs[1]:
|
|
439 |
st.write("No judging details found for the selected combination.")
|
440 |
|
441 |
with tabs[2]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
st.write(
|
443 |
"""
|
444 |
Please reach out if you are interested in collaborating!
|
|
|
51 |
return img_str
|
52 |
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
# Load your dataframes
|
55 |
df_test_set = pd.read_json("data/test_set.jsonl", lines=True)
|
56 |
df_responses = pd.read_json("data/responses.jsonl", lines=True)
|
|
|
74 |
# Prepare the judge selector options
|
75 |
judge_options = df_response_judging["llm_judge"].unique().tolist()
|
76 |
|
77 |
+
st.set_page_config(page_title="Language Model Council", page_icon="🏛️", layout="wide")
|
78 |
|
79 |
# Create three columns
|
80 |
col1, col2, col3 = st.columns(3)
|
|
|
134 |
# st.markdown(centered_image_html, unsafe_allow_html=True)
|
135 |
|
136 |
# Title and subtitle.
|
137 |
+
st.title("🗳️ Language Model Council")
|
138 |
st.markdown(
|
139 |
"###### Benchmarking Foundation Models on Highly Subjective Tasks by Consensus :classical_building:"
|
140 |
)
|
|
|
171 |
)
|
172 |
|
173 |
# Create horizontal tabs
|
174 |
+
tabs = st.tabs(
|
175 |
+
[
|
176 |
+
"Leaderboard Results",
|
177 |
+
"Interpersonal Conflicts",
|
178 |
+
"Analysis",
|
179 |
+
"About Us",
|
180 |
+
]
|
181 |
+
)
|
182 |
|
183 |
# Define content for each tab
|
184 |
with tabs[0]:
|
185 |
+
_, mid_column, _ = st.columns([0.2, 0.6, 0.2])
|
186 |
+
mid_column.dataframe(df_leaderboard)
|
187 |
|
188 |
|
189 |
# HTML and CSS to create a text box with specified color
|
|
|
193 |
background-color: {background_color};
|
194 |
color: {text_color};
|
195 |
padding: 10px;
|
196 |
+
border-radius: 10px;
|
197 |
">
|
198 |
{text}
|
199 |
</div>
|
|
|
263 |
# Display the detailed dilemma and additional information
|
264 |
st.markdown(
|
265 |
colored_text_box(
|
266 |
+
scenario_details["detailed_dilemma"],
|
267 |
+
"#01204E",
|
268 |
+
"white",
|
269 |
),
|
270 |
unsafe_allow_html=True,
|
271 |
)
|
272 |
with st.expander("Additional Information"):
|
273 |
+
st.write(
|
274 |
+
{
|
275 |
+
"LLM Author": scenario_details["llm_author"],
|
276 |
+
"Problem": scenario_details["problem"],
|
277 |
+
"Relationship": scenario_details["relationship"],
|
278 |
+
"Scenario": scenario_details["scenario"],
|
279 |
+
}
|
280 |
+
)
|
281 |
|
282 |
st.divider()
|
283 |
|
|
|
302 |
# Display the response string
|
303 |
st.markdown(
|
304 |
colored_text_box(
|
305 |
+
response_details_fixed["response_string"],
|
306 |
+
"#028391",
|
307 |
+
"white",
|
308 |
),
|
309 |
unsafe_allow_html=True,
|
310 |
)
|
|
|
332 |
# Display the response string
|
333 |
st.markdown(
|
334 |
colored_text_box(
|
335 |
+
response_details_dynamic["response_string"],
|
336 |
+
"#028391",
|
337 |
+
"white",
|
338 |
),
|
339 |
unsafe_allow_html=True,
|
340 |
)
|
|
|
424 |
st.markdown(
|
425 |
colored_text_box(
|
426 |
judging_details_left["judging_response_string"],
|
427 |
+
"#FEAE6F",
|
428 |
"black",
|
429 |
),
|
430 |
unsafe_allow_html=True,
|
|
|
440 |
st.markdown(
|
441 |
colored_text_box(
|
442 |
judging_details_right["judging_response_string"],
|
443 |
+
"#FEAE6F",
|
444 |
"black",
|
445 |
),
|
446 |
unsafe_allow_html=True,
|
|
|
449 |
st.write("No judging details found for the selected combination.")
|
450 |
|
451 |
with tabs[2]:
|
452 |
+
st.markdown("### Battles (Respondent vs. Respondent)")
|
453 |
+
st.write("Expected win rates based on Terry-Bradley coefficients:")
|
454 |
+
image = Image.open("img/llm_vs_llm_win_rates.png")
|
455 |
+
img_base64 = pil_to_base64(image)
|
456 |
+
centered_image_html = f"""
|
457 |
+
<div style="text-align: center;">
|
458 |
+
<img src="data:image/png;base64,{img_base64}" width="1000"/>
|
459 |
+
</div>
|
460 |
+
"""
|
461 |
+
st.markdown(centered_image_html, unsafe_allow_html=True)
|
462 |
+
|
463 |
+
st.markdown("### Affinities (Judge vs. Respondent)")
|
464 |
+
|
465 |
+
st.write("Raw affinities:")
|
466 |
+
image = Image.open("img/raw.png")
|
467 |
+
img_base64 = pil_to_base64(image)
|
468 |
+
centered_image_html = f"""
|
469 |
+
<div style="text-align: center;">
|
470 |
+
<img src="data:image/png;base64,{img_base64}" width="1000"/>
|
471 |
+
</div>
|
472 |
+
"""
|
473 |
+
st.markdown(centered_image_html, unsafe_allow_html=True)
|
474 |
+
|
475 |
+
st.write("Council-Normalized:")
|
476 |
+
image = Image.open("img/council_normalized.png")
|
477 |
+
img_base64 = pil_to_base64(image)
|
478 |
+
centered_image_html = f"""
|
479 |
+
<div style="text-align: center;">
|
480 |
+
<img src="data:image/png;base64,{img_base64}" width="1000"/>
|
481 |
+
</div>
|
482 |
+
"""
|
483 |
+
st.markdown(centered_image_html, unsafe_allow_html=True)
|
484 |
+
|
485 |
+
st.markdown("### Agreement (Judge vs. Judge)")
|
486 |
+
|
487 |
+
st.write("Sidewise Cohen's Kappa:")
|
488 |
+
image = Image.open("img/judge_agreement.sidewise_cohen_kappa.png")
|
489 |
+
img_base64 = pil_to_base64(image)
|
490 |
+
centered_image_html = f"""
|
491 |
+
<div style="text-align: center;">
|
492 |
+
<img src="data:image/png;base64,{img_base64}" width="1000"/>
|
493 |
+
</div>
|
494 |
+
"""
|
495 |
+
st.markdown(centered_image_html, unsafe_allow_html=True)
|
496 |
+
|
497 |
+
st.write("Check out the paper for more detailed analysis!")
|
498 |
+
|
499 |
+
with tabs[-1]:
|
500 |
st.write(
|
501 |
"""
|
502 |
Please reach out if you are interested in collaborating!
|
img/council_normalized.png
ADDED
img/judge_agreement.sidewise_cohen_kappa.png
ADDED
img/llm_vs_llm_win_rates.png
ADDED
img/raw.png
ADDED