ZebraLogic / ZeroEval-main /result_dirs /zebra-grid.summary.json
yuchenlin's picture
zebra logic bench
57a9cf2
raw
history blame
7.89 kB
[
{
"Model": "claude-3-5-sonnet-20240620",
"Mode": "greedy",
"Puzzle Acc": "33.40",
"Cell Acc": "54.34",
"No answer": "0.00",
"Easy Puzzle Acc": "87.50",
"Hard Puzzle Acc": "12.36",
"Total Puzzles": 1000,
"Reason Lens": "1141.94"
},
{
"Model": "claude-3-5-sonnet-20240620",
"Mode": "sampling",
"Puzzle Acc": "33.40",
"Cell Acc": "53.01",
"No answer": "0.10",
"Easy Puzzle Acc": "88.21",
"Hard Puzzle Acc": "12.08",
"Total Puzzles": 1000,
"Reason Lens": "1153.83"
},
{
"Model": "gpt-4o-2024-05-13",
"Mode": "sampling",
"Puzzle Acc": "30.80",
"Cell Acc": "46.19",
"No answer": "6.60",
"Easy Puzzle Acc": "81.07",
"Hard Puzzle Acc": "11.25",
"Total Puzzles": 1000,
"Reason Lens": "1549.74"
},
{
"Model": "gpt-4-turbo-2024-04-09",
"Mode": "greedy",
"Puzzle Acc": "28.40",
"Cell Acc": "47.90",
"No answer": "0.10",
"Easy Puzzle Acc": "80.71",
"Hard Puzzle Acc": "8.06",
"Total Puzzles": 1000,
"Reason Lens": "1148.46"
},
{
"Model": "gpt-4o-2024-05-13",
"Mode": "greedy",
"Puzzle Acc": "28.20",
"Cell Acc": "38.72",
"No answer": "19.30",
"Easy Puzzle Acc": "77.86",
"Hard Puzzle Acc": "8.89",
"Total Puzzles": 1000,
"Reason Lens": "1643.51"
},
{
"Model": "gpt-4-0314",
"Mode": "greedy",
"Puzzle Acc": "27.10",
"Cell Acc": "47.43",
"No answer": "0.20",
"Easy Puzzle Acc": "77.14",
"Hard Puzzle Acc": "7.64",
"Total Puzzles": 1000,
"Reason Lens": "1203.17"
},
{
"Model": "claude-3-opus-20240229",
"Mode": "greedy",
"Puzzle Acc": "27.00",
"Cell Acc": "48.91",
"No answer": "0.00",
"Easy Puzzle Acc": "78.21",
"Hard Puzzle Acc": "7.08",
"Total Puzzles": 1000,
"Reason Lens": "855.72"
},
{
"Model": "gpt-4-turbo-2024-04-09",
"Mode": "sampling",
"Puzzle Acc": "26.40",
"Cell Acc": "47.93",
"No answer": "0.00",
"Easy Puzzle Acc": "74.29",
"Hard Puzzle Acc": "7.78",
"Total Puzzles": 1000,
"Reason Lens": "1165.90"
},
{
"Model": "deepseek-chat",
"Mode": "greedy",
"Puzzle Acc": "22.70",
"Cell Acc": "42.46",
"No answer": "5.20",
"Easy Puzzle Acc": "68.57",
"Hard Puzzle Acc": "4.86",
"Total Puzzles": 1000,
"Reason Lens": "1260.23"
},
{
"Model": "Qwen2-72B-Instruct",
"Mode": "greedy",
"Puzzle Acc": "21.40",
"Cell Acc": "38.32",
"No answer": "10.20",
"Easy Puzzle Acc": "63.93",
"Hard Puzzle Acc": "4.86",
"Total Puzzles": 1000,
"Reason Lens": "1813.82"
},
{
"Model": "deepseek-coder",
"Mode": "greedy",
"Puzzle Acc": "21.10",
"Cell Acc": "41.58",
"No answer": "4.90",
"Easy Puzzle Acc": "64.64",
"Hard Puzzle Acc": "4.17",
"Total Puzzles": 1000,
"Reason Lens": "1324.55"
},
{
"Model": "gemini-1.5-pro",
"Mode": "sampling",
"Puzzle Acc": "19.70",
"Cell Acc": "45.24",
"No answer": "0.40",
"Easy Puzzle Acc": "60.00",
"Hard Puzzle Acc": "4.03",
"Total Puzzles": 1000,
"Reason Lens": "1356.77"
},
{
"Model": "gemini-1.5-flash",
"Mode": "greedy",
"Puzzle Acc": "19.40",
"Cell Acc": "31.77",
"No answer": "22.70",
"Easy Puzzle Acc": "59.29",
"Hard Puzzle Acc": "3.89",
"Total Puzzles": 1000,
"Reason Lens": "1538.18"
},
{
"Model": "gemini-1.5-pro",
"Mode": "greedy",
"Puzzle Acc": "19.40",
"Cell Acc": "44.59",
"No answer": "0.80",
"Easy Puzzle Acc": "55.71",
"Hard Puzzle Acc": "5.28",
"Total Puzzles": 1000,
"Reason Lens": "1336.17"
},
{
"Model": "yi-large-preview",
"Mode": "greedy",
"Puzzle Acc": "18.90",
"Cell Acc": "42.61",
"No answer": "1.40",
"Easy Puzzle Acc": "58.93",
"Hard Puzzle Acc": "3.33",
"Total Puzzles": 1000,
"Reason Lens": "833.36"
},
{
"Model": "yi-large",
"Mode": "greedy",
"Puzzle Acc": "18.80",
"Cell Acc": "39.83",
"No answer": "1.80",
"Easy Puzzle Acc": "58.21",
"Hard Puzzle Acc": "3.47",
"Total Puzzles": 1000,
"Reason Lens": "757.01"
},
{
"Model": "claude-3-sonnet-20240229",
"Mode": "greedy",
"Puzzle Acc": "18.70",
"Cell Acc": "43.66",
"No answer": "0.00",
"Easy Puzzle Acc": "58.93",
"Hard Puzzle Acc": "3.06",
"Total Puzzles": 1000,
"Reason Lens": "1095.37"
},
{
"Model": "Qwen2-72B-Instruct",
"Mode": "sampling",
"Puzzle Acc": "18.70",
"Cell Acc": "40.57",
"No answer": "3.20",
"Easy Puzzle Acc": "57.50",
"Hard Puzzle Acc": "3.61",
"Total Puzzles": 1000,
"Reason Lens": "1894.72"
},
{
"Model": "gemini-1.5-flash",
"Mode": "sampling",
"Puzzle Acc": "18.40",
"Cell Acc": "36.03",
"No answer": "12.80",
"Easy Puzzle Acc": "57.86",
"Hard Puzzle Acc": "3.06",
"Total Puzzles": 1000,
"Reason Lens": "1713.03"
},
{
"Model": "Meta-Llama-3-70B-Instruct",
"Mode": "greedy",
"Puzzle Acc": "16.80",
"Cell Acc": "42.31",
"No answer": "0.20",
"Easy Puzzle Acc": "52.86",
"Hard Puzzle Acc": "2.78",
"Total Puzzles": 1000,
"Reason Lens": "809.95"
},
{
"Model": "gemma-2-27b-it@nvidia",
"Mode": "greedy",
"Puzzle Acc": "16.30",
"Cell Acc": "41.18",
"No answer": "1.10",
"Easy Puzzle Acc": "50.71",
"Hard Puzzle Acc": "2.92",
"Total Puzzles": 1000,
"Reason Lens": "1014.56"
},
{
"Model": "claude-3-haiku-20240307",
"Mode": "greedy",
"Puzzle Acc": "14.30",
"Cell Acc": "37.87",
"No answer": "0.10",
"Easy Puzzle Acc": "47.86",
"Hard Puzzle Acc": "1.25",
"Total Puzzles": 1000,
"Reason Lens": "1015.06"
},
{
"Model": "reka-core-20240501",
"Mode": "greedy",
"Puzzle Acc": "13.00",
"Cell Acc": "33.88",
"No answer": "4.00",
"Easy Puzzle Acc": "43.21",
"Hard Puzzle Acc": "1.25",
"Total Puzzles": 1000,
"Reason Lens": "1078.29"
},
{
"Model": "gemma-2-9b-it",
"Mode": "greedy",
"Puzzle Acc": "12.90",
"Cell Acc": "37.07",
"No answer": "0.50",
"Easy Puzzle Acc": "42.14",
"Hard Puzzle Acc": "1.53",
"Total Puzzles": 1000,
"Reason Lens": "859.14"
},
{
"Model": "gemma-2-9b-it@nvidia",
"Mode": "greedy",
"Puzzle Acc": "12.80",
"Cell Acc": "36.79",
"No answer": "0.00",
"Easy Puzzle Acc": "41.79",
"Hard Puzzle Acc": "1.53",
"Total Puzzles": 1000,
"Reason Lens": "849.84"
},
{
"Model": "Meta-Llama-3-8B-Instruct",
"Mode": "greedy",
"Puzzle Acc": "11.90",
"Cell Acc": "23.70",
"No answer": "29.20",
"Easy Puzzle Acc": "40.71",
"Hard Puzzle Acc": "0.69",
"Total Puzzles": 1000,
"Reason Lens": "1216.40"
},
{
"Model": "Meta-Llama-3-8B-Instruct",
"Mode": "sampling",
"Puzzle Acc": "11.00",
"Cell Acc": "26.11",
"No answer": "22.30",
"Easy Puzzle Acc": "36.79",
"Hard Puzzle Acc": "0.97",
"Total Puzzles": 1000,
"Reason Lens": "1282.40"
},
{
"Model": "gpt-3.5-turbo-0125",
"Mode": "greedy",
"Puzzle Acc": "10.10",
"Cell Acc": "33.06",
"No answer": "0.10",
"Easy Puzzle Acc": "33.57",
"Hard Puzzle Acc": "0.97",
"Total Puzzles": 1000,
"Reason Lens": "820.66"
},
{
"Model": "reka-flash-20240226",
"Mode": "greedy",
"Puzzle Acc": "9.30",
"Cell Acc": "25.67",
"No answer": "18.70",
"Easy Puzzle Acc": "30.71",
"Hard Puzzle Acc": "0.97",
"Total Puzzles": 1000,
"Reason Lens": "1074.80"
},
{
"Model": "Qwen2-7B-Instruct",
"Mode": "greedy",
"Puzzle Acc": "8.40",
"Cell Acc": "22.06",
"No answer": "24.40",
"Easy Puzzle Acc": "29.29",
"Hard Puzzle Acc": "0.28",
"Total Puzzles": 1000,
"Reason Lens": "1473.23"
}
]