Dongfu Jiang committed
Commit 83fe087 • 1 Parent(s): a0937f6
.gitignore ADDED
@@ -0,0 +1,3 @@
+__pycache__*
+test*
+/data_dir
__pycache__/constants.cpython-310.pyc DELETED
Binary file (4.66 kB)
 
__pycache__/data_utils.cpython-310.pyc DELETED
Binary file (5.05 kB)
 
__pycache__/themes.cpython-310.pyc DELETED
Binary file (1.38 kB)
 
__pycache__/utils_display.cpython-310.pyc DELETED
Binary file (1.34 kB)
 
_header.md CHANGED
@@ -1,4 +1,4 @@
 <br/>
 
-# WildFeedback data demo
+# VAPO data demo
 
app.py CHANGED
@@ -10,6 +10,7 @@ import pandas as pd
 import gradio as gr
 import pandas as pd
 from pathlib import Path
+from difflib import Differ
 import json
 from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light
 from datetime import datetime, timezone
@@ -39,68 +40,76 @@ eval_results = load_eval_results()
 
 available_models = [] # to be filled in later
 
-dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='train')
 
 import random
 random.seed(42)
 np.random.seed(42)
-def sample_an_feedback(selected_models):
-    feedback = None
-    all_valid_feedbacks = []
-    all_valid_model_responses = []
-    for example in tqdm(dataset, total=len(dataset), desc="Searching for valid examples"):
-        example_model_responses = example['responses']
-        valid_model_responses = [model_response for model_response in example_model_responses if model_response['model'] in selected_models]
-        if len(valid_model_responses) >= 2:
-            all_valid_feedbacks.append(example)
-            all_valid_model_responses.append(random.sample(valid_model_responses, 2))
-    if len(all_valid_feedbacks) == 0:
-        return gr.Exit("No valid examples found. Please select other models.")
+def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
+
+    def filter_examples(item):
+        if task_category and item['category'] not in task_category:
+            return False
+        if task_difficulty and item['difficulty'] not in task_difficulty:
+            return False
+        if task_quality and item['quality'] not in task_quality:
+            return False
+        if feedback_score and item['feedback']['processed']['score'] not in feedback_score:
+            return False
+        return True
 
-    random_idx = random.randint(0, len(all_valid_feedbacks) - 1)
-    feedback = all_valid_feedbacks[random_idx]
-    model_response_1, model_response_2 = all_valid_model_responses[random_idx]
+    valid_examples = dataset.filter(filter_examples, num_proc=4)
+
+    if len(valid_examples) == 0:
+        raise ValueError("No examples found for the selected filters. Please try again with different filters.")
+    print(f"Found {len(valid_examples)} examples for the selected filters.")
+
+    example = random.choice(valid_examples)
 
     plan_history = {
         "user": [
-            feedback['query'],
-            "Please give the feedback (query GPT-4o-mini)"
+            example['query'],
         ],
         "assistant": [
-            model_response_1['response'],
-            model_response_2['feedback']['raw']
+            example['response']
         ]
     }
 
     ground_history = {
         "user": [
-            feedback['query'],
-            "Please give the feedback (query GPT-4o-mini)"
+            example['query'],
         ],
         "assistant": [
-            model_response_2['response'],
-            model_response_2['feedback']['raw']
+            example['revision']['processed']
         ]
     }
 
     result_dict = {
-        "session_id": feedback['id'],
-        "task": feedback['source'],
-        "task_type": feedback['source'],
+        "session_id": example['id'],
+        "category": example['category'],
+        "difficulty": example['difficulty'],
+        "quality": example['quality'],
+        "intent": example['intent'],
         "plan_history": plan_history,
         "ground_history": ground_history,
         # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
        # "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
-        "pred": str(model_response_1['model']),
-        "answer": str(model_response_2['model']),
-        "correctness": "GPT-4o-mini",
+        "pred": example['model'], # model that generates the original response
+        "answer": example['revision']['model'], # model that generates the revised response
+        "correctness": example['feedback']['model'], # model that generates the feedback for the original response
        "image": "file/data_dir/test_images/000000341196.jpg"
     }
     return result_dict
-
-def display_chat_history(model_selections):
-    # eval_item = sample_an_eval_result(eval_results, model_selections)
-    eval_item = sample_an_feedback(model_selections)
+
+
+def diff_texts(text1, text2):
+    d = Differ()
+    return [
+        (token[2:], token[0] if token[0] != " " else None)
+        for token in d.compare(text1, text2)
+    ]
+
+def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
+    eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
     print("---" * 10)
     for key, value in eval_item.items():
         print(f"{key}: {value}")
@@ -108,11 +117,13 @@ def display_chat_history(model_selections):
 
     # eval_item = sample_an_feedback()
     session_id = eval_item["session_id"]
-    task = eval_item["task"]
-    task_type = eval_item["task_type"]
+    category = eval_item["category"]
     prediction = eval_item["pred"]
     gold_answer = eval_item["answer"]
     correctness = eval_item["correctness"]
+    difficulty = eval_item["difficulty"]
+    quality = eval_item["quality"]
+    intent = eval_item["intent"]
 
     if eval_item["image"]:
         image_path = eval_item["image"]
@@ -126,20 +137,22 @@ def display_chat_history(model_selections):
         chats_ground += [item_user, item_asst]
     chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
     chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
-    task_metadata = f"- 🆔: `{session_id}` \n- **Task category**: {task_type}"
+    task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
+
+    diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
 
-    print(f"Task: {task}")
-    print(f"Plan History: {chats_plan}")
-    print(f"Ground History: {chats_ground}")
-    print(f"Task Metadata: {task_metadata}")
-    print(f"Prediction: {prediction}")
-    print(f"Gold Answer: {gold_answer}")
-    print(f"Correctness: {correctness}")
+    print(f"Category: {category}")
+    print(f"Difficulty: {difficulty}")
+    print(f"Quality: {quality}")
+    print(f"Intent: {intent}")
+    print(f"Session ID: {session_id}")
+    print(f"Original Response: {chats_plan}")
+    print(f"Revised Response: {chats_ground}")
     if image_path != "":
         image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
-        return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image
+        return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
     else:
-        return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>'
+        return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
 
 
 
@@ -167,9 +180,10 @@ def slider_change_full(length_penalty, show_winrate):
     adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
     return adjusted_df
 
+
 seafoam = Seafoam()
 def build_demo(TYPES):
-    global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
+    global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.Markdown(HEADER_MD, elem_classes="markdown-text")
 
@@ -181,13 +195,29 @@ def build_demo(TYPES):
 
            with gr.Row():
                with gr.Column():
-                    with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
-                        model_options = available_models
-                        selected_models = gr.CheckboxGroup(model_options, info="", value=model_options, show_label=False, elem_id="select-models")
+
+                    with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
+                        task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
                        clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
                        # clear the selected_models
-                        clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_models])
-
+                        clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
+                    with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
+                        task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
+                        clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                        # clear the selected_models
+                        clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
+                    with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
+                        feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
+                        clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                        # clear the selected_models
+                        clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score])
+
+                    with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
+                        task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
+                        clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                        # clear the selected_models
+                        clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category])
+
            with gr.Row(visible=False):
                with gr.Column(scale=1.5):
                    with gr.Accordion("📝 Task Description", open=True, elem_classes="accordion-label"):
@@ -208,35 +238,45 @@ def build_demo(TYPES):
            with gr.Row():
                with gr.Column(scale=1.1):
                    # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
-                    gr.Markdown("## 📢 Model A's response and feedback", elem_classes="accordion-label")
+                    gr.Markdown("## 📢 Model Original Response", elem_classes="accordion-label")
                    Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                    Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                with gr.Column(scale=1):
                    # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
-                    gr.Markdown("## 📢 Model B's response and feedback", elem_classes="accordion-label")
+                    gr.Markdown("## 📢 Model Revised Response", elem_classes="accordion-label")
                    Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                    Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
+            with gr.Row():
+                with gr.Column():
+                    with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
+                        highlighted_diff = gr.HighlightedText(label="Highlighted differences",
+                                                              combine_adjacent=False,
+                                                              show_legend=True,
+                                                              color_map={"+": "green", "-": "red"})
+
            with gr.Row():
                with gr.Column():
                    # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
-                    with gr.Accordion("Model A Name", open=True, elem_classes="accordion-label"):
+                    with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
                        prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                        prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
                with gr.Column():
                    # with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
-                    with gr.Accordion("Model B Name", open=True, elem_classes="accordion-label"):
+                    with gr.Accordion("Revision Model", open=True, elem_classes="accordion-label"):
                        gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                        gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
                with gr.Column(visible=True):
-                    with gr.Accordion("Feedback Model Name", open=True, elem_classes="accordion-label"):
+                    with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
                        correctness = gr.HTML("", elem_id="markdown-text-tiny")
                        correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
            # Display chat history when button is clicked
-            btn_show_history.click(fn=display_chat_history, inputs=[selected_models], outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image])
+            btn_show_history.click(fn=display_chat_history,
+                                   inputs=[task_category, task_difficulty, task_quality, feedback_score],
+                                   outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff])
 
        with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
            gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
@@ -259,43 +299,24 @@ def build_demo(TYPES):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
-    parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
+    parser.add_argument("--result_file", help="Path to results table", default="data_dir/pair_feedbacks_1.jsonl")
     parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
     parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
     args = parser.parse_args()
 
     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
-
-    original_df = pd.read_json(args.result_file , lines=True)
-    ablation_df = pd.read_json(args.length_balation_file, lines=True)
-    skip_empty_original_df = pd.read_json(args.skip_empty_result_file , lines=True)
-    skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
-
 
     # available_models = sorted(list(set(list(original_df["model name "]))))
     # available_models = list(model_info.keys())
 
-    available_models = set()
-    for example in dataset:
-        for model_response in example['responses']:
-            available_models.add(model_response['model'])
-    available_models = sorted(list(available_models))
-
-    # remove the rows where the model name is not in the available_models
-    original_df = original_df[original_df["model name "].isin(available_models)]
-    ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
-    skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
-    skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
-
-    model_len_info = json.load(open("model_len_info.json", "r"))
-
-    original_df = post_processing(original_df, model_len_info)
-    ablation_df = post_processing(ablation_df, model_len_info)
-    skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
-    skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
+    # dataset = datasets.Dataset.from_json(args.result_file)
+    dataset = datasets.load_dataset("DongfuJiang/VAPO", "pair_feedback_iter_1", split='train')
 
-
+    avaliable_difficulty = sorted(list(set(dataset['difficulty'])))
+    avaliable_quality = sorted(list(set(dataset['quality'])))
+    available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
+    available_categories = sorted(list(set(dataset['category'])))
 
 
     TYPES = ["markdown", "number"]
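
For reference, a minimal standalone sketch (not part of this commit) of how the new `diff_texts` helper pairs with `gr.HighlightedText`: `difflib.Differ.compare` emits tokens prefixed with "+ ", "- ", or "  ", and the helper converts them into `(text, label)` tuples that the component colors via `color_map`. The sample strings below are placeholders for the original and revised responses.

```python
# Standalone illustration of the diff_texts -> HighlightedText wiring.
# The example strings are hypothetical, not data from the demo.
from difflib import Differ

import gradio as gr


def diff_texts(text1, text2):
    # Differ.compare yields character tokens like "- a", "+ b", "  c";
    # keep the character and use the marker ("+"/"-") as the highlight label.
    d = Differ()
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(text1, text2)
    ]


with gr.Blocks() as demo:
    gr.HighlightedText(
        value=diff_texts("The quick brown fox.", "The quick red fox!"),
        combine_adjacent=False,
        show_legend=True,
        color_map={"+": "green", "-": "red"},
    )

if __name__ == "__main__":
    demo.launch()
```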