VAPO_data_demo

Sleeping

App Files Files Community

Dongfu Jiang committed on Jul 31

Commit

83fe087

•

1 Parent(s): a0937f6

update

Browse files

Files changed (7) hide show

.gitignore +3 -0
__pycache__/constants.cpython-310.pyc +0 -0
__pycache__/data_utils.cpython-310.pyc +0 -0
__pycache__/themes.cpython-310.pyc +0 -0
__pycache__/utils_display.cpython-310.pyc +0 -0
_header.md +1 -1
app.py +105 -84

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+__pycache__*
+test*
+/data_dir

__pycache__/constants.cpython-310.pyc DELETED Viewed

Binary file (4.66 kB)

__pycache__/data_utils.cpython-310.pyc DELETED Viewed

Binary file (5.05 kB)

__pycache__/themes.cpython-310.pyc DELETED Viewed

Binary file (1.38 kB)

__pycache__/utils_display.cpython-310.pyc DELETED Viewed

Binary file (1.34 kB)

_header.md CHANGED Viewed

@@ -1,4 +1,4 @@
 <br/>
-# WildFeedback data demo


1	<br/>
2
3	+ # VAPO data demo
4

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import pandas as pd
 import gradio as gr
 import pandas as pd
 from pathlib import Path
 import json
 from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light
 from datetime import datetime, timezone
@@ -39,68 +40,76 @@ eval_results = load_eval_results()
 available_models = [] # to be filled in later
-dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='train')
 import random
 random.seed(42)
 np.random.seed(42)
-def sample_an_feedback(selected_models):
-    feedback = None
-    all_valid_feedbacks = []
-    all_valid_model_responses = []
-    for example in tqdm(dataset, total=len(dataset), desc="Searching for valid examples"):
-        example_model_responses = example['responses']
-        valid_model_responses = [model_response for model_response in example_model_responses if model_response['model'] in selected_models]
-        if len(valid_model_responses) >= 2:
-            all_valid_feedbacks.append(example)
-            all_valid_model_responses.append(random.sample(valid_model_responses, 2))
-    if len(all_valid_feedbacks) == 0:
-        return gr.Exit("No valid examples found. Please select other models.")
-    random_idx = random.randint(0, len(all_valid_feedbacks) - 1)
-    feedback = all_valid_feedbacks[random_idx]
-    model_response_1, model_response_2 = all_valid_model_responses[random_idx]
     plan_history = {
         "user": [
-            feedback['query'],
-            "Please give the feedback (query GPT-4o-mini)"
         ],
         "assistant": [
-            model_response_1['response'],
-            model_response_2['feedback']['raw']
         ]
     }
     ground_history = {
         "user": [
-            feedback['query'],
-            "Please give the feedback (query GPT-4o-mini)"
         ],
         "assistant": [
-            model_response_2['response'],
-            model_response_2['feedback']['raw']
         ]
     }
     result_dict = {
-        "session_id": feedback['id'],
-        "task": feedback['source'],
-        "task_type": feedback['source'],
         "plan_history": plan_history,
         "ground_history": ground_history,
         # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
         # "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
-        "pred": str(model_response_1['model']),
-        "answer": str(model_response_2['model']),
-        "correctness": "GPT-4o-mini",
         "image": "file/data_dir/test_images/000000341196.jpg"
     }
     return result_dict
-def display_chat_history(model_selections):
-    # eval_item = sample_an_eval_result(eval_results, model_selections)
-    eval_item = sample_an_feedback(model_selections)
     print("---" * 10)
     for key, value in eval_item.items():
         print(f"{key}: {value}")
@@ -108,11 +117,13 @@ def display_chat_history(model_selections):
     # eval_item = sample_an_feedback()
     session_id = eval_item["session_id"]
-    task = eval_item["task"]
-    task_type = eval_item["task_type"]
     prediction = eval_item["pred"]
     gold_answer = eval_item["answer"]
     correctness = eval_item["correctness"]
     if eval_item["image"]:
         image_path = eval_item["image"]
@@ -126,20 +137,22 @@ def display_chat_history(model_selections):
         chats_ground += [item_user, item_asst]
     chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
     chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
-    task_metadata = f"- 🆔: `{session_id}` \n- **Task category**: {task_type}"
-    print(f"Task: {task}")
-    print(f"Plan History: {chats_plan}")
-    print(f"Ground History: {chats_ground}")
-    print(f"Task Metadata: {task_metadata}")
-    print(f"Prediction: {prediction}")
-    print(f"Gold Answer: {gold_answer}")
-    print(f"Correctness: {correctness}")
     if image_path != "":
         image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
-        return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image
     else:
-        return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>'
@@ -167,9 +180,10 @@ def slider_change_full(length_penalty, show_winrate):
         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
     return adjusted_df
 seafoam = Seafoam()
 def build_demo(TYPES):
-    global original_df, ablation_df, skip_empty_original_df, skip_empty_ablation_df, available_models
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.Markdown(HEADER_MD, elem_classes="markdown-text")
@@ -181,13 +195,29 @@ def build_demo(TYPES):
                 with gr.Row():
                     with gr.Column():
-                        with gr.Accordion("Choose models to sample from", open=False, elem_classes="accordion-label"):
-                            model_options = available_models
-                            selected_models = gr.CheckboxGroup(model_options, info="", value=model_options, show_label=False, elem_id="select-models")
                             clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
                             # clear the selected_models
-                            clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_models])
                 with gr.Row(visible=False):
                     with gr.Column(scale=1.5):
                         with gr.Accordion("📝 Task Description", open=True, elem_classes="accordion-label"):
@@ -208,35 +238,45 @@ def build_demo(TYPES):
                 with gr.Row():
                     with gr.Column(scale=1.1):
                         # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
-                        gr.Markdown("## 📢 Model A's response and feedback", elem_classes="accordion-label")
                         Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                         Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column(scale=1):
                         # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
-                        gr.Markdown("## 📢 Model B's response and feedback", elem_classes="accordion-label")
                         Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                         Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                 with gr.Row():
                     with gr.Column():
                         # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
-                        with gr.Accordion("Model A Name", open=True, elem_classes="accordion-label"):
                             prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                             prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column():
                         # with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
-                        with gr.Accordion("Model B Name", open=True, elem_classes="accordion-label"):
                             gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                             gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column(visible=True):
-                        with gr.Accordion("Feedback Model Name", open=True, elem_classes="accordion-label"):
                             correctness = gr.HTML("", elem_id="markdown-text-tiny")
                             correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                 # Display chat history when button is clicked
-                btn_show_history.click(fn=display_chat_history, inputs=[selected_models], outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image])
             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
@@ -259,43 +299,24 @@ def build_demo(TYPES):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
-    parser.add_argument("--result_file", help="Path to results table", default="data_dir/elo_ranks.all.jsonl")
     parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
     parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
     args = parser.parse_args()
     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
-    original_df = pd.read_json(args.result_file , lines=True)
-    ablation_df = pd.read_json(args.length_balation_file, lines=True)
-    skip_empty_original_df = pd.read_json(args.skip_empty_result_file , lines=True)
-    skip_empty_ablation_df = pd.read_json(args.skip_empty_length_balation_file, lines=True)
     # available_models = sorted(list(set(list(original_df["model name "]))))
     # available_models = list(model_info.keys())
-    available_models = set()
-    for example in dataset:
-        for model_response in example['responses']:
-            available_models.add(model_response['model'])
-    available_models = sorted(list(available_models))
-    # remove the rows where the model name is not in the available_models
-    original_df = original_df[original_df["model name "].isin(available_models)]
-    ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
-    skip_empty_ablation_df = skip_empty_ablation_df[skip_empty_ablation_df["model name "].isin(available_models)]
-    skip_empty_original_df = skip_empty_original_df[skip_empty_original_df["model name "].isin(available_models)]
-    model_len_info = json.load(open("model_len_info.json", "r"))
-    original_df = post_processing(original_df, model_len_info)
-    ablation_df = post_processing(ablation_df, model_len_info)
-    skip_empty_original_df = post_processing(skip_empty_original_df, model_len_info)
-    skip_empty_ablation_df = post_processing(skip_empty_ablation_df, model_len_info)
     TYPES = ["markdown", "number"]

 import gradio as gr
 import pandas as pd
 from pathlib import Path
+from difflib import Differ
 import json
 from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light
 from datetime import datetime, timezone
 available_models = [] # to be filled in later
 import random
 random.seed(42)
 np.random.seed(42)
def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
    """Draw one random dataset example that satisfies the selected filters.

    Each argument is a collection of accepted values for one field of an
    example; a falsy argument (such as an empty selection) disables that
    filter. The examples are read from the module-level ``dataset``.

    Returns:
        A dict holding the example's id and metadata, the original and
        revised conversations (``plan_history`` / ``ground_history``), the
        names of the models involved, and a fixed image path.

    Raises:
        ValueError: If no example passes the active filters.
    """
    def _passes(row):
        # A row is rejected as soon as one active filter excludes it; the
        # feedback score is only looked up when a score filter is active.
        rejected = (
            (task_category and row['category'] not in task_category)
            or (task_difficulty and row['difficulty'] not in task_difficulty)
            or (task_quality and row['quality'] not in task_quality)
            or (feedback_score and row['feedback']['processed']['score'] not in feedback_score)
        )
        return not rejected

    candidates = dataset.filter(_passes, num_proc=4)
    match_count = len(candidates)
    if match_count == 0:
        raise ValueError("No examples found for the selected filters. Please try again with different filters.")
    print(f"Found {match_count} examples for the selected filters.")
    chosen = random.choice(candidates)

    # Left-hand chat: the query with the original response.
    original_chat = {
        "user": [chosen['query']],
        "assistant": [chosen['response']],
    }
    # Right-hand chat: the same query with the revised response.
    revised_chat = {
        "user": [chosen['query']],
        "assistant": [chosen['revision']['processed']],
    }
    return {
        "session_id": chosen['id'],
        "category": chosen['category'],
        "difficulty": chosen['difficulty'],
        "quality": chosen['quality'],
        "intent": chosen['intent'],
        "plan_history": original_chat,
        "ground_history": revised_chat,
        "pred": chosen['model'],  # model that generates the original response
        "answer": chosen['revision']['model'],  # model that generates the revised response
        "correctness": chosen['feedback']['model'],  # model that generates the feedback for the original response
        "image": "file/data_dir/test_images/000000341196.jpg",
    }
def diff_texts(text1, text2):
    """Diff two texts item by item and tag each item for highlighting.

    When both arguments are strings the comparison is character by
    character, because ``Differ.compare`` iterates over its inputs.

    Args:
        text1: The original text; ``None`` is treated as an empty string.
        text2: The revised text; ``None`` is treated as an empty string.

    Returns:
        A list of ``(token, tag)`` tuples in diff order. ``tag`` is ``"+"``
        for a token found only in ``text2``, ``"-"`` for a token found only
        in ``text1``, and ``None`` for a token common to both.
    """
    # Differ.compare raises TypeError on None. A caller may pass None when a
    # response or revision field is missing (presumably possible in the
    # dataset -- verify), so diff against an empty string instead.
    text1 = "" if text1 is None else text1
    text2 = "" if text2 is None else text2
    # Every Differ line is a two-character code ("+ ", "- " or "  ")
    # followed by the compared item; unchanged items get no tag.
    return [
        (line[2:], None if line[0] == " " else line[0])
        for line in Differ().compare(text1, text2)
    ]
+def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
+    eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
     print("---" * 10)
     for key, value in eval_item.items():
         print(f"{key}: {value}")
     # eval_item = sample_an_feedback()
     session_id = eval_item["session_id"]
+    category = eval_item["category"]
     prediction = eval_item["pred"]
     gold_answer = eval_item["answer"]
     correctness = eval_item["correctness"]
+    difficulty = eval_item["difficulty"]
+    quality = eval_item["quality"]
+    intent = eval_item["intent"]
     if eval_item["image"]:
         image_path = eval_item["image"]
         chats_ground += [item_user, item_asst]
     chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
     chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
+    task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
+    diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
+    print(f"Category: {category}")
+    print(f"Difficulty: {difficulty}")
+    print(f"Quality: {quality}")
+    print(f"Intent: {intent}")
+    print(f"Session ID: {session_id}")
+    print(f"Original Response: {chats_plan}")
+    print(f"Revised Response: {chats_ground}")
     if image_path != "":
         image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
+        return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
     else:
+        return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
         adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4")
     return adjusted_df
 seafoam = Seafoam()
 def build_demo(TYPES):
+    global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
     with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
         gr.Markdown(HEADER_MD, elem_classes="markdown-text")
                 with gr.Row():
                     with gr.Column():
+                        with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
+                            task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
                             clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
                             # clear the selected task difficulties
+                            clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
+                        with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
+                            task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
+                            clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                            # clear the selected task qualities
+                            clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
+                        with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
+                            feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
+                            clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                            # clear the selected feedback scores
+                            clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score])
+                        with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
+                            task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
+                            clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
+                            # clear the selected task categories
+                            clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category])
                 with gr.Row(visible=False):
                     with gr.Column(scale=1.5):
                         with gr.Accordion("📝 Task Description", open=True, elem_classes="accordion-label"):
                 with gr.Row():
                     with gr.Column(scale=1.1):
                         # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
+                        gr.Markdown("## 📢 Model Original Response", elem_classes="accordion-label")
                         Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                         Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column(scale=1):
                         # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
+                        gr.Markdown("## 📢 Model Revised Response", elem_classes="accordion-label")
                         Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                         Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
+                            highlighted_diff = gr.HighlightedText(label="Highlighted differences",
+                                                            combine_adjacent=False,
+                                                            show_legend=True,
+                                                            color_map={"+": "green", "-": "red"})
                 with gr.Row():
                     with gr.Column():
                         # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
+                        with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
                             prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                             prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column():
                         # with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
+                        with gr.Accordion("Revision Model", open=True, elem_classes="accordion-label"):
                             gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                             gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                     with gr.Column(visible=True):
+                        with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
                             correctness = gr.HTML("", elem_id="markdown-text-tiny")
                             correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
                 # Display chat history when button is clicked
+                btn_show_history.click(fn=display_chat_history,
+                                       inputs=[task_category, task_difficulty, task_quality, feedback_score],
+                                       outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff])
             with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
                 gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--share", action="store_true")
+    parser.add_argument("--result_file", help="Path to results table", default="data_dir/pair_feedbacks_1.jsonl")
     parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl")
     parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl")
     parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl")
     args = parser.parse_args()
     LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
     # available_models = sorted(list(set(list(original_df["model name "]))))
     # available_models = list(model_info.keys())
+    # dataset = datasets.Dataset.from_json(args.result_file)
+    dataset = datasets.load_dataset("DongfuJiang/VAPO", "pair_feedback_iter_1", split='train')
+    avaliable_difficulty = sorted(list(set(dataset['difficulty'])))
+    avaliable_quality = sorted(list(set(dataset['quality'])))
+    available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
+    available_categories = sorted(list(set(dataset['category'])))
     TYPES = ["markdown", "number"]