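"""Data-loading helpers for the ZeroEval zebra-grid (logic grid puzzle) result explorer.

Downloads per-model prediction files from the WildEval/ZeroEval GitHub
repository, caches them under ``result_dir``, and samples a single puzzle of a
requested grid size together with the model's predicted solution table, its
reasoning, and the ground-truth table rendered as markdown.
"""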
import json
import math
import os
import random

import numpy as np
from tqdm import tqdm
from tabulate import tabulate  # used to render the solution tables as markdown

from datasets import load_dataset, Dataset
from datasets.utils.logging import disable_progress_bar

from constants import column_names, RANKING_COLUMN, ORDERED_COLUMN_NAMES
from utils_display import make_clickable_model
from eval_utils import *  # expected to provide extract_last_complete_json and eval_each_puzzle

disable_progress_bar()
summary_file = "ZeroEval-main/result_dirs/zebra-grid.summary.json"
result_dir = "ZeroEval-main/result_dirs/zebra-grid/"
results_by_model = {}
# Formats the columns: strings are left untouched, numbers are rounded to one decimal.
def formatter(x):
    if isinstance(x, str):
        return x
    return round(x, 1)
def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_columns=ORDERED_COLUMN_NAMES, click_url=True):
    for col in df.columns:
        if col == "Model" and click_url:
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # for numerical values
    df.rename(columns=column_names, inplace=True)
    list_columns = [col for col in ordered_columns if col in df.columns]
    df = df[list_columns]
    if rank_column in df.columns:
        df.sort_values(by=rank_column, inplace=True, ascending=False)
    return df
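# Usage sketch (not executed in this script): `post_processing` is meant to be
# applied to a leaderboard DataFrame, e.g. one built from the summary file.
# The exact column names depend on the constants imported above; treat the
# snippet below as an illustration rather than the app's actual wiring.
#
#   import pandas as pd
#   with open(summary_file, "r") as f:
#       leaderboard_df = pd.DataFrame(json.load(f))
#   leaderboard_df = post_processing(leaderboard_df, column_names)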
def load_all_data():
    global summary_file, result_dir
    with open(summary_file, "r") as f:
        model_summary = json.load(f)
    model_names = [model["Model"] for model in model_summary]
    for model_name in model_names:
        try:
            # Some models appear in the summary under a different name than
            # their result file; map them to the file names used on GitHub.
            model_rename_map = {
                "Llama-3.1-405B-Inst-fp8@together": "Llama-3.1-405B-Instruct-Turbo",
                "Llama-3.1-405B-Inst@hyperbolic": "Meta-Llama-3.1-405B-Instruct@hyperbolic",
                "deepseek-v2-chat-0628": "deepseek-v2-chat",
                "deepseek-v2-coder-0724": "DeepSeek-Coder-V2-0724",
                "deepseek-v2-coder-0614": "deepseek-v2-coder",
                "gemma-2-9b-it": "gemma-2-9b-it@nvidia",
                "gemma-2-27b-it": "gemma-2-27b-it@nvidia"
            }
            if model_name in model_rename_map:
                model_name = model_rename_map[model_name]
            download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/refs/heads/main/result_dirs/zebra-grid/{model_name}.json"
            output_file = os.path.join(result_dir, f"{model_name}.json")
            # Create result_dir if it does not exist (mkdir -p).
            os.makedirs(result_dir, exist_ok=True)
            if not os.path.exists(output_file):
                os.system(f"wget {download_url} -O {output_file}")
                print(f"Downloaded {model_name}.json")
            with open(output_file, "r") as f:
                print(f"Loading {output_file}")
                results_by_model[model_name] = json.load(f)
        except Exception as e:
            print(f"Error loading {model_name}: {e}")
            continue
def get_random_item(model_name="random", size_H="random", size_W="random"):
    global summary_file, result_dir, results_by_model
    if results_by_model is None or len(results_by_model) == 0:
        load_all_data()
    if model_name == "random":
        model_name = random.choice(list(results_by_model.keys()))
    data = results_by_model[model_name]
    random.shuffle(data)
    selected_item = None
    prediction_table = None
    prediction_reasoning = None
    id_to_item = {}
    for item in data:
        id_to_item[item["id"]] = item
    # Pick a grid size; zebra-grid puzzle sizes range from 2*2 to 6*6.
    if size_H == "random":
        size_H_choice = random.choice(list(range(2, 7)))
    else:
        size_H_choice = size_H
    if size_W == "random":
        size_W_choice = random.choice(list(range(2, 7)))
    else:
        size_W_choice = size_W
    ok_ids = [id for id in id_to_item if id_to_item[id]["size"].startswith(f"{size_H_choice}*{size_W_choice}")]
    for ok_id in ok_ids:
        item = id_to_item[ok_id]
        prediction_str = item["output"][0]
        prediction_json = extract_last_complete_json(prediction_str)
        if prediction_json is None or "solution" not in prediction_json:
            continue
        # Skip puzzles with clue types we do not want to display.
        if "child" in item["puzzle"].lower() or "mother" in item["puzzle"].lower():
            continue
        if "loves the spaghetti eater" in item["puzzle"].lower():
            continue
        prediction_reasoning = prediction_json.get("reasoning", "")
        prediction_table = prediction_json["solution"]
        if prediction_table is not None and "House 1" in prediction_table:
            selected_item = item
            break
    if selected_item is None:
        print("No item found!")
        return None
    explore_item = {}
    explore_item["id"] = selected_item["id"]
    explore_item["Model"] = model_name
    explore_item["size"] = selected_item["size"]
    explore_item["puzzle"] = selected_item["puzzle"]
    explore_item["solution"] = prediction_table
    explore_item["reasoning"] = prediction_reasoning
    # Render the predicted solution as a GitHub-flavored markdown table.
    headers = ["Houses"] + list(prediction_table["House 1"].keys())
    rows = []
    for row_id in range(len(prediction_table)):
        row = [row_id + 1]
        for feature in headers[1:]:
            row.append(prediction_table[f"House {row_id + 1}"][feature])
        rows.append(row)
    table_md = tabulate(rows, headers=headers, tablefmt="github")
    explore_item["solution_table_md"] = table_md
    # Score the prediction cell by cell against the ground-truth solution.
    this_total_cells, this_correct_cells, truth_solution_table = eval_each_puzzle(explore_item["id"], prediction_table)
    explore_item["correct_cells"] = this_correct_cells
    explore_item["total_cells"] = this_total_cells
    explore_item["truth_solution_table"] = tabulate(truth_solution_table["rows"], headers=truth_solution_table["header"], tablefmt="github")
    return explore_item
if __name__ == "__main__":
    load_all_data()
    print("All data downloaded!")
    print(json.dumps(get_random_item(model_name="gemini-1.5-pro", size_H="2", size_W="5"), indent=2))