Spaces:
Sleeping
Sleeping
Dongfu Jiang
committed on
Commit
•
83fe087
1
Parent(s):
a0937f6
update
Browse files- .gitignore +3 -0
- __pycache__/constants.cpython-310.pyc +0 -0
- __pycache__/data_utils.cpython-310.pyc +0 -0
- __pycache__/themes.cpython-310.pyc +0 -0
- __pycache__/utils_display.cpython-310.pyc +0 -0
- _header.md +1 -1
- app.py +105 -84
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__*
|
2 |
+
test*
|
3 |
+
/data_dir
|
__pycache__/constants.cpython-310.pyc
DELETED
Binary file (4.66 kB)
|
|
__pycache__/data_utils.cpython-310.pyc
DELETED
Binary file (5.05 kB)
|
|
__pycache__/themes.cpython-310.pyc
DELETED
Binary file (1.38 kB)
|
|
__pycache__/utils_display.cpython-310.pyc
DELETED
Binary file (1.34 kB)
|
|
_header.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
<br/>
|
2 |
|
3 |
-
#
|
4 |
|
|
|
1 |
<br/>
|
2 |
|
3 |
+
# VAPO data demo
|
4 |
|
app.py
CHANGED
@@ -10,6 +10,7 @@ import pandas as pd
|
|
10 |
import gradio as gr
|
11 |
import pandas as pd
|
12 |
from pathlib import Path
|
|
|
13 |
import json
|
14 |
from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light
|
15 |
from datetime import datetime, timezone
|
@@ -39,68 +40,76 @@ eval_results = load_eval_results()
|
|
39 |
|
40 |
available_models = [] # to be filled in later
|
41 |
|
42 |
-
dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='train')
|
43 |
|
44 |
import random
|
45 |
random.seed(42)
|
46 |
np.random.seed(42)
|
47 |
-
def sample_an_feedback(
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
if
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
return
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
|
|
63 |
|
64 |
plan_history = {
|
65 |
"user": [
|
66 |
-
|
67 |
-
"Please give the feedback (query GPT-4o-mini)"
|
68 |
],
|
69 |
"assistant": [
|
70 |
-
|
71 |
-
model_response_2['feedback']['raw']
|
72 |
]
|
73 |
}
|
74 |
|
75 |
ground_history = {
|
76 |
"user": [
|
77 |
-
|
78 |
-
"Please give the feedback (query GPT-4o-mini)"
|
79 |
],
|
80 |
"assistant": [
|
81 |
-
|
82 |
-
model_response_2['feedback']['raw']
|
83 |
]
|
84 |
}
|
85 |
|
86 |
result_dict = {
|
87 |
-
"session_id":
|
88 |
-
"
|
89 |
-
"
|
|
|
|
|
90 |
"plan_history": plan_history,
|
91 |
"ground_history": ground_history,
|
92 |
# "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
|
93 |
# "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
|
94 |
-
"pred":
|
95 |
-
"answer":
|
96 |
-
"correctness":
|
97 |
"image": "file/data_dir/test_images/000000341196.jpg"
|
98 |
}
|
99 |
return result_dict
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
print("---" * 10)
|
105 |
for key, value in eval_item.items():
|
106 |
print(f"{key}: {value}")
|
@@ -108,11 +117,13 @@ def display_chat_history(model_selections):
|
|
108 |
|
109 |
# eval_item = sample_an_feedback()
|
110 |
session_id = eval_item["session_id"]
|
111 |
-
|
112 |
-
task_type = eval_item["task_type"]
|
113 |
prediction = eval_item["pred"]
|
114 |
gold_answer = eval_item["answer"]
|
115 |
correctness = eval_item["correctness"]
|
|
|
|
|
|
|
116 |
|
117 |
if eval_item["image"]:
|
118 |
image_path = eval_item["image"]
|
@@ -126,20 +137,22 @@ def display_chat_history(model_selections):
|
|
126 |
chats_ground += [item_user, item_asst]
|
127 |
chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
|
128 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
129 |
-
task_metadata = f"- ๐: `{session_id}` \n- **
|
|
|
|
|
130 |
|
131 |
-
print(f"
|
132 |
-
print(f"
|
133 |
-
print(f"
|
134 |
-
print(f"
|
135 |
-
print(f"
|
136 |
-
print(f"
|
137 |
-
print(f"
|
138 |
if image_path != "":
|
139 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
140 |
-
return
|
141 |
else:
|
142 |
-
return
|
143 |
|
144 |
|
145 |
|
@@ -167,9 +180,10 @@ def slider_change_full(length_penalty, show_winrate):
|
|
167 |
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
|
168 |
return adjusted_df
|
169 |
|
|
|
170 |
seafoam = Seafoam()
|
171 |
def build_demo(TYPES):
|
172 |
-
global
|
173 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
|
174 |
gr.Markdown(HEADER_MD, elem_classes="markdown-text")
|
175 |
|
@@ -181,13 +195,29 @@ def build_demo(TYPES):
|
|
181 |
|
182 |
with gr.Row():
|
183 |
with gr.Column():
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
188 |
# clear the selected_models
|
189 |
-
clear_button.click(lambda: {
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
with gr.Row(visible=False):
|
192 |
with gr.Column(scale=1.5):
|
193 |
with gr.Accordion("๐ Task Description", open=True, elem_classes="accordion-label"):
|
@@ -208,35 +238,45 @@ def build_demo(TYPES):
|
|
208 |
with gr.Row():
|
209 |
with gr.Column(scale=1.1):
|
210 |
# gr.Markdown("## ๐ข Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
211 |
-
gr.Markdown("## ๐ข Model
|
212 |
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
213 |
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
214 |
with gr.Column(scale=1):
|
215 |
# gr.Markdown("## ๐ข Ground Module Process History", elem_classes="accordion-label")
|
216 |
-
gr.Markdown("## ๐ข Model
|
217 |
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
218 |
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
with gr.Row():
|
221 |
with gr.Column():
|
222 |
# with gr.Accordion("๐ Prediction", open=True, elem_classes="accordion-label"):
|
223 |
-
with gr.Accordion("Model
|
224 |
prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
|
225 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
226 |
|
227 |
with gr.Column():
|
228 |
# with gr.Accordion("๐ Ground-Truth Answer", open=True, elem_classes="accordion-label"):
|
229 |
-
with gr.Accordion("Model
|
230 |
gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
|
231 |
gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
232 |
|
233 |
with gr.Column(visible=True):
|
234 |
-
with gr.Accordion("Feedback Model
|
235 |
correctness = gr.HTML("", elem_id="markdown-text-tiny")
|
236 |
correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
237 |
|
238 |
# Display chat history when button is clicked
|
239 |
-
btn_show_history.click(fn=display_chat_history,
|
|
|
|
|
240 |
|
241 |
with gr.TabItem("๐ฎ About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
|
242 |
gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
|
@@ -259,43 +299,24 @@ def build_demo(TYPES):
|
|
259 |
if __name__ == "__main__":
|
260 |
parser = argparse.ArgumentParser()
|
261 |
parser.add_argument("--share", action="store_true")
|
262 |
-
parser.add_argument("--result_file", help="Path to results table", default="data_dir/
|
263 |
parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
|
264 |
parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
|
265 |
parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
|
266 |
args = parser.parse_args()
|
267 |
|
268 |
LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
|
269 |
-
|
270 |
-
original_df = pd.read_json(args.result_file , lines=True)
|
271 |
-
ablation_df = pd.read_json(args.length_balation_file, lines=True)
|
272 |
-
skip_empty_original_df = pd.read_json(args.skip_empty_result_file , lines=True)
|
273 |
-
skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
|
274 |
-
|
275 |
|
276 |
# available_models = sorted(list(set(list(original_df["model name "]))))
|
277 |
# available_models = list(model_info.keys())
|
278 |
|
279 |
-
|
280 |
-
|
281 |
-
for model_response in example['responses']:
|
282 |
-
available_models.add(model_response['model'])
|
283 |
-
available_models = sorted(list(available_models))
|
284 |
-
|
285 |
-
# remove the rows where the model name is not in the available_models
|
286 |
-
original_df = original_df[original_df["model name "].isin(available_models)]
|
287 |
-
ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
|
288 |
-
skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
|
289 |
-
skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
|
290 |
-
|
291 |
-
model_len_info = json.load(open("model_len_info.json", "r"))
|
292 |
-
|
293 |
-
original_df = post_processing(original_df, model_len_info)
|
294 |
-
ablation_df = post_processing(ablation_df, model_len_info)
|
295 |
-
skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
|
296 |
-
skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
|
297 |
|
298 |
-
|
|
|
|
|
|
|
299 |
|
300 |
|
301 |
TYPES = ["markdown", "number"]
|
|
|
10 |
import gradio as gr
|
11 |
import pandas as pd
|
12 |
from pathlib import Path
|
13 |
+
from difflib import Differ
|
14 |
import json
|
15 |
from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light
|
16 |
from datetime import datetime, timezone
|
|
|
40 |
|
41 |
available_models = [] # to be filled in later
|
42 |
|
|
|
43 |
|
44 |
import random
|
45 |
random.seed(42)
|
46 |
np.random.seed(42)
|
47 |
+
def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
|
48 |
+
|
49 |
+
def filter_examples(item):
|
50 |
+
if task_category and item['category'] not in task_category:
|
51 |
+
return False
|
52 |
+
if task_difficulty and item['difficulty'] not in task_difficulty:
|
53 |
+
return False
|
54 |
+
if task_quality and item['quality'] not in task_quality:
|
55 |
+
return False
|
56 |
+
if feedback_score and item['feedback']['processed']['score'] not in feedback_score:
|
57 |
+
return False
|
58 |
+
return True
|
59 |
|
60 |
+
valid_examples = dataset.filter(filter_examples, num_proc=4)
|
61 |
+
|
62 |
+
if len(valid_examples) == 0:
|
63 |
+
raise ValueError("No examples found for the selected filters. Please try again with different filters.")
|
64 |
+
print(f"Found {len(valid_examples)} examples for the selected filters.")
|
65 |
+
|
66 |
+
example = random.choice(valid_examples)
|
67 |
|
68 |
plan_history = {
|
69 |
"user": [
|
70 |
+
example['query'],
|
|
|
71 |
],
|
72 |
"assistant": [
|
73 |
+
example['response']
|
|
|
74 |
]
|
75 |
}
|
76 |
|
77 |
ground_history = {
|
78 |
"user": [
|
79 |
+
example['query'],
|
|
|
80 |
],
|
81 |
"assistant": [
|
82 |
+
example['revision']['processed']
|
|
|
83 |
]
|
84 |
}
|
85 |
|
86 |
result_dict = {
|
87 |
+
"session_id": example['id'],
|
88 |
+
"category": example['category'],
|
89 |
+
"difficulty": example['difficulty'],
|
90 |
+
"quality": example['quality'],
|
91 |
+
"intent": example['intent'],
|
92 |
"plan_history": plan_history,
|
93 |
"ground_history": ground_history,
|
94 |
# "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
|
95 |
# "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
|
96 |
+
"pred": example['model'], # model that generates the original response
|
97 |
+
"answer": example['revision']['model'], # model that generates the revised response
|
98 |
+
"correctness": example['feedback']['model'], # model that generates the feedback for the original response
|
99 |
"image": "file/data_dir/test_images/000000341196.jpg"
|
100 |
}
|
101 |
return result_dict
|
102 |
+
|
103 |
+
|
104 |
+
def diff_texts(text1, text2):
|
105 |
+
d = Differ()
|
106 |
+
return [
|
107 |
+
(token[2:], token[0] if token[0] != " " else None)
|
108 |
+
for token in d.compare(text1, text2)
|
109 |
+
]
|
110 |
+
|
111 |
+
def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
|
112 |
+
eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
|
113 |
print("---" * 10)
|
114 |
for key, value in eval_item.items():
|
115 |
print(f"{key}: {value}")
|
|
|
117 |
|
118 |
# eval_item = sample_an_feedback()
|
119 |
session_id = eval_item["session_id"]
|
120 |
+
category = eval_item["category"]
|
|
|
121 |
prediction = eval_item["pred"]
|
122 |
gold_answer = eval_item["answer"]
|
123 |
correctness = eval_item["correctness"]
|
124 |
+
difficulty = eval_item["difficulty"]
|
125 |
+
quality = eval_item["quality"]
|
126 |
+
intent = eval_item["intent"]
|
127 |
|
128 |
if eval_item["image"]:
|
129 |
image_path = eval_item["image"]
|
|
|
137 |
chats_ground += [item_user, item_asst]
|
138 |
chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
|
139 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
140 |
+
task_metadata = f"- ๐: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
|
141 |
+
|
142 |
+
diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
|
143 |
|
144 |
+
print(f"Category: {category}")
|
145 |
+
print(f"Difficulty: {difficulty}")
|
146 |
+
print(f"Quality: {quality}")
|
147 |
+
print(f"Intent: {intent}")
|
148 |
+
print(f"Session ID: {session_id}")
|
149 |
+
print(f"Original Response: {chats_plan}")
|
150 |
+
print(f"Revised Response: {chats_ground}")
|
151 |
if image_path != "":
|
152 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
153 |
+
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
|
154 |
else:
|
155 |
+
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
|
156 |
|
157 |
|
158 |
|
|
|
180 |
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
|
181 |
return adjusted_df
|
182 |
|
183 |
+
|
184 |
seafoam = Seafoam()
|
185 |
def build_demo(TYPES):
|
186 |
+
global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
|
187 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
|
188 |
gr.Markdown(HEADER_MD, elem_classes="markdown-text")
|
189 |
|
|
|
195 |
|
196 |
with gr.Row():
|
197 |
with gr.Column():
|
198 |
+
|
199 |
+
with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
|
200 |
+
task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
|
201 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
202 |
# clear the selected_models
|
203 |
+
clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
|
204 |
+
with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
|
205 |
+
task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
|
206 |
+
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
207 |
+
# clear the selected_models
|
208 |
+
clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
|
209 |
+
with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
|
210 |
+
feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
|
211 |
+
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
212 |
+
# clear the selected_models
|
213 |
+
clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score])
|
214 |
+
|
215 |
+
with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
|
216 |
+
task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
|
217 |
+
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
218 |
+
# clear the selected_models
|
219 |
+
clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category])
|
220 |
+
|
221 |
with gr.Row(visible=False):
|
222 |
with gr.Column(scale=1.5):
|
223 |
with gr.Accordion("๐ Task Description", open=True, elem_classes="accordion-label"):
|
|
|
238 |
with gr.Row():
|
239 |
with gr.Column(scale=1.1):
|
240 |
# gr.Markdown("## ๐ข Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
241 |
+
gr.Markdown("## ๐ข Model Original Response", elem_classes="accordion-label")
|
242 |
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
243 |
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
244 |
with gr.Column(scale=1):
|
245 |
# gr.Markdown("## ๐ข Ground Module Process History", elem_classes="accordion-label")
|
246 |
+
gr.Markdown("## ๐ข Model Revised Response", elem_classes="accordion-label")
|
247 |
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
248 |
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
249 |
|
250 |
+
with gr.Row():
|
251 |
+
with gr.Column():
|
252 |
+
with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
|
253 |
+
highlighted_diff = gr.HighlightedText(label="Highlighted differences",
|
254 |
+
combine_adjacent=False,
|
255 |
+
show_legend=True,
|
256 |
+
color_map={"+": "green", "-": "red"})
|
257 |
+
|
258 |
with gr.Row():
|
259 |
with gr.Column():
|
260 |
# with gr.Accordion("๐ Prediction", open=True, elem_classes="accordion-label"):
|
261 |
+
with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
|
262 |
prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
|
263 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
264 |
|
265 |
with gr.Column():
|
266 |
# with gr.Accordion("๐ Ground-Truth Answer", open=True, elem_classes="accordion-label"):
|
267 |
+
with gr.Accordion("Revision Model", open=True, elem_classes="accordion-label"):
|
268 |
gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
|
269 |
gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
270 |
|
271 |
with gr.Column(visible=True):
|
272 |
+
with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
|
273 |
correctness = gr.HTML("", elem_id="markdown-text-tiny")
|
274 |
correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
275 |
|
276 |
# Display chat history when button is clicked
|
277 |
+
btn_show_history.click(fn=display_chat_history,
|
278 |
+
inputs=[task_category, task_difficulty, task_quality, feedback_score],
|
279 |
+
outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff])
|
280 |
|
281 |
with gr.TabItem("๐ฎ About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
|
282 |
gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
|
|
|
299 |
if __name__ == "__main__":
|
300 |
parser = argparse.ArgumentParser()
|
301 |
parser.add_argument("--share", action="store_true")
|
302 |
+
parser.add_argument("--result_file", help="Path to results table", default="data_dir/pair_feedbacks_1.jsonl")
|
303 |
parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
|
304 |
parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
|
305 |
parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
|
306 |
args = parser.parse_args()
|
307 |
|
308 |
LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
|
310 |
# available_models = sorted(list(set(list(original_df["model name "]))))
|
311 |
# available_models = list(model_info.keys())
|
312 |
|
313 |
+
# dataset = datasets.Dataset.from_json(args.result_file)
|
314 |
+
dataset = datasets.load_dataset("DongfuJiang/VAPO", "pair_feedback_iter_1", split='train')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
315 |
|
316 |
+
avaliable_difficulty = sorted(list(set(dataset['difficulty'])))
|
317 |
+
avaliable_quality = sorted(list(set(dataset['quality'])))
|
318 |
+
available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
|
319 |
+
available_categories = sorted(list(set(dataset['category'])))
|
320 |
|
321 |
|
322 |
TYPES = ["markdown", "number"]
|