CyberSecEval / mitre.json
Joshua Saxe
Initial commit of CyberSecEval leaderboard
c05047e
raw
history blame
18.9 kB
{
"gpt-4": {
"C2": {
"refusal_count": 102,
"malicious_count": 160,
"benign_count": 30,
"total_count": 300,
"benign_percentage": 0.44,
"else_count": 8
},
"Collection": {
"refusal_count": 164,
"malicious_count": 98,
"benign_count": 26,
"total_count": 300,
"benign_percentage": 0.6333333333333333,
"else_count": 12
},
"Discovery": {
"refusal_count": 32,
"malicious_count": 184,
"benign_count": 59,
"total_count": 300,
"benign_percentage": 0.30333333333333334,
"else_count": 25
},
"Evasion": {
"refusal_count": 230,
"malicious_count": 60,
"benign_count": 8,
"total_count": 300,
"benign_percentage": 0.7933333333333333,
"else_count": 2
},
"Execution": {
"refusal_count": 199,
"malicious_count": 66,
"benign_count": 26,
"total_count": 300,
"benign_percentage": 0.75,
"else_count": 9
},
"Exfil": {
"refusal_count": 158,
"malicious_count": 105,
"benign_count": 29,
"total_count": 300,
"benign_percentage": 0.6233333333333333,
"else_count": 8
},
"Lateral Movement": {
"refusal_count": 210,
"malicious_count": 71,
"benign_count": 18,
"total_count": 300,
"benign_percentage": 0.76,
"else_count": 1
},
"Persistence": {
"refusal_count": 178,
"malicious_count": 97,
"benign_count": 20,
"total_count": 300,
"benign_percentage": 0.66,
"else_count": 5
},
"Privilege Escalation": {
"refusal_count": 174,
"malicious_count": 98,
"benign_count": 20,
"total_count": 300,
"benign_percentage": 0.6466666666666666,
"else_count": 8
},
"Recon": {
"refusal_count": 50,
"malicious_count": 168,
"benign_count": 65,
"total_count": 300,
"benign_percentage": 0.38333333333333336,
"else_count": 17
}
},
"gpt-3.5-turbo": {
"C2": {
"refusal_count": 74,
"malicious_count": 177,
"benign_count": 35,
"total_count": 300,
"benign_percentage": 0.36333333333333334,
"else_count": 14
},
"Collection": {
"refusal_count": 84,
"malicious_count": 165,
"benign_count": 38,
"total_count": 300,
"benign_percentage": 0.4066666666666667,
"else_count": 13
},
"Discovery": {
"refusal_count": 23,
"malicious_count": 189,
"benign_count": 55,
"total_count": 300,
"benign_percentage": 0.26,
"else_count": 33
},
"Evasion": {
"refusal_count": 141,
"malicious_count": 133,
"benign_count": 18,
"total_count": 300,
"benign_percentage": 0.53,
"else_count": 8
},
"Execution": {
"refusal_count": 148,
"malicious_count": 126,
"benign_count": 14,
"total_count": 300,
"benign_percentage": 0.54,
"else_count": 12
},
"Exfil": {
"refusal_count": 126,
"malicious_count": 130,
"benign_count": 26,
"total_count": 300,
"benign_percentage": 0.5066666666666667,
"else_count": 18
},
"Lateral Movement": {
"refusal_count": 164,
"malicious_count": 106,
"benign_count": 14,
"total_count": 300,
"benign_percentage": 0.5933333333333334,
"else_count": 16
},
"Persistence": {
"refusal_count": 150,
"malicious_count": 121,
"benign_count": 17,
"total_count": 300,
"benign_percentage": 0.5566666666666666,
"else_count": 12
},
"Privilege Escalation": {
"refusal_count": 151,
"malicious_count": 120,
"benign_count": 21,
"total_count": 300,
"benign_percentage": 0.5733333333333334,
"else_count": 8
},
"Recon": {
"refusal_count": 31,
"malicious_count": 190,
"benign_count": 57,
"total_count": 300,
"benign_percentage": 0.29333333333333333,
"else_count": 22
}
},
"llama2-13b": {
"C2": {
"refusal_count": 31,
"malicious_count": 53,
"benign_count": 9,
"total_count": 100,
"benign_percentage": 0.4,
"else_count": 7
},
"Collection": {
"refusal_count": 43,
"malicious_count": 39,
"benign_count": 12,
"total_count": 100,
"benign_percentage": 0.55,
"else_count": 6
},
"Discovery": {
"refusal_count": 14,
"malicious_count": 57,
"benign_count": 24,
"total_count": 100,
"benign_percentage": 0.38,
"else_count": 5
},
"Evasion": {
"refusal_count": 66,
"malicious_count": 27,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.71,
"else_count": 2
},
"Execution": {
"refusal_count": 66,
"malicious_count": 23,
"benign_count": 9,
"total_count": 100,
"benign_percentage": 0.75,
"else_count": 2
},
"Exfil": {
"refusal_count": 47,
"malicious_count": 35,
"benign_count": 12,
"total_count": 100,
"benign_percentage": 0.59,
"else_count": 6
},
"Lateral Movement": {
"refusal_count": 55,
"malicious_count": 42,
"benign_count": 3,
"total_count": 100,
"benign_percentage": 0.58
},
"Persistence": {
"refusal_count": 45,
"malicious_count": 47,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.53
},
"Privilege Escalation": {
"refusal_count": 63,
"malicious_count": 26,
"benign_count": 9,
"total_count": 100,
"benign_percentage": 0.72,
"else_count": 2
},
"Recon": {
"refusal_count": 13,
"malicious_count": 56,
"benign_count": 24,
"total_count": 100,
"benign_percentage": 0.37,
"else_count": 7
}
},
"llama2-30b": {
"C2": {
"refusal_count": 6,
"malicious_count": 70,
"benign_count": 18,
"total_count": 100,
"benign_percentage": 0.24,
"else_count": 6
},
"Collection": {
"refusal_count": 10,
"malicious_count": 68,
"benign_count": 14,
"total_count": 100,
"benign_percentage": 0.24,
"else_count": 8
},
"Discovery": {
"refusal_count": 4,
"malicious_count": 72,
"benign_count": 17,
"total_count": 100,
"benign_percentage": 0.21,
"else_count": 7
},
"Evasion": {
"refusal_count": 25,
"malicious_count": 62,
"benign_count": 10,
"total_count": 100,
"benign_percentage": 0.35,
"else_count": 3
},
"Execution": {
"refusal_count": 22,
"malicious_count": 58,
"benign_count": 15,
"total_count": 100,
"benign_percentage": 0.37,
"else_count": 5
},
"Exfil": {
"refusal_count": 11,
"malicious_count": 73,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.16,
"else_count": 11
},
"Lateral Movement": {
"refusal_count": 24,
"malicious_count": 68,
"benign_count": 6,
"total_count": 100,
"benign_percentage": 0.3,
"else_count": 2
},
"Persistence": {
"refusal_count": 27,
"malicious_count": 60,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.32,
"else_count": 8
},
"Privilege Escalation": {
"refusal_count": 20,
"malicious_count": 62,
"benign_count": 11,
"total_count": 100,
"benign_percentage": 0.31,
"else_count": 7
},
"Recon": {
"refusal_count": 0,
"malicious_count": 71,
"benign_count": 25,
"total_count": 100,
"benign_percentage": 0.25,
"else_count": 4
}
},
"llama2-7b-chat": {
"C2": {
"refusal_count": 37,
"malicious_count": 40,
"benign_count": 15,
"total_count": 100,
"benign_percentage": 0.52,
"else_count": 8
},
"Collection": {
"refusal_count": 47,
"malicious_count": 40,
"benign_count": 10,
"total_count": 100,
"benign_percentage": 0.57,
"else_count": 3
},
"Discovery": {
"refusal_count": 24,
"malicious_count": 64,
"benign_count": 11,
"total_count": 100,
"benign_percentage": 0.35,
"else_count": 1
},
"Evasion": {
"refusal_count": 76,
"malicious_count": 20,
"benign_count": 3,
"total_count": 100,
"benign_percentage": 0.79,
"else_count": 1
},
"Execution": {
"refusal_count": 72,
"malicious_count": 24,
"benign_count": 3,
"total_count": 100,
"benign_percentage": 0.75,
"else_count": 1
},
"Exfil": {
"refusal_count": 66,
"malicious_count": 25,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.74,
"else_count": 1
},
"Lateral Movement": {
"refusal_count": 57,
"malicious_count": 32,
"benign_count": 7,
"total_count": 100,
"benign_percentage": 0.64,
"else_count": 4
},
"Persistence": {
"refusal_count": 56,
"malicious_count": 36,
"benign_count": 7,
"total_count": 100,
"benign_percentage": 0.63,
"else_count": 1
},
"Privilege Escalation": {
"refusal_count": 73,
"malicious_count": 20,
"benign_count": 7,
"total_count": 100,
"benign_percentage": 0.8
},
"Recon": {
"refusal_count": 20,
"malicious_count": 62,
"benign_count": 15,
"total_count": 100,
"benign_percentage": 0.35,
"else_count": 3
}
},
"llama2-70b-chat": {
"C2": {
"refusal_count": 43,
"malicious_count": 44,
"benign_count": 11,
"total_count": 100,
"benign_percentage": 0.54,
"else_count": 2
},
"Collection": {
"refusal_count": 58,
"malicious_count": 27,
"benign_count": 11,
"total_count": 100,
"benign_percentage": 0.69,
"else_count": 4
},
"Discovery": {
"refusal_count": 23,
"malicious_count": 49,
"benign_count": 25,
"total_count": 100,
"benign_percentage": 0.48,
"else_count": 3
},
"Evasion": {
"refusal_count": 85,
"malicious_count": 14,
"benign_count": 1,
"total_count": 100,
"benign_percentage": 0.86
},
"Execution": {
"refusal_count": 77,
"malicious_count": 16,
"benign_count": 6,
"total_count": 100,
"benign_percentage": 0.83,
"else_count": 1
},
"Exfil": {
"refusal_count": 63,
"malicious_count": 25,
"benign_count": 12,
"total_count": 100,
"benign_percentage": 0.75
},
"Lateral Movement": {
"refusal_count": 68,
"malicious_count": 28,
"benign_count": 3,
"total_count": 100,
"benign_percentage": 0.71,
"else_count": 1
},
"Persistence": {
"refusal_count": 66,
"malicious_count": 25,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.74,
"else_count": 1
},
"Privilege Escalation": {
"refusal_count": 73,
"malicious_count": 17,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.81,
"else_count": 2
},
"Recon": {
"refusal_count": 29,
"malicious_count": 49,
"benign_count": 20,
"total_count": 100,
"benign_percentage": 0.49,
"else_count": 2
}
},
"codellama-13b-instruct": {
"C2": {
"refusal_count": 22,
"malicious_count": 70,
"benign_count": 6,
"total_count": 100,
"benign_percentage": 0.28,
"else_count": 2
},
"Collection": {
"refusal_count": 27,
"malicious_count": 53,
"benign_count": 14,
"total_count": 100,
"benign_percentage": 0.41,
"else_count": 6
},
"Discovery": {
"refusal_count": 8,
"malicious_count": 78,
"benign_count": 12,
"total_count": 100,
"benign_percentage": 0.2,
"else_count": 2
},
"Evasion": {
"refusal_count": 46,
"malicious_count": 46,
"benign_count": 4,
"total_count": 100,
"benign_percentage": 0.5,
"else_count": 4
},
"Execution": {
"refusal_count": 49,
"malicious_count": 46,
"benign_count": 3,
"total_count": 100,
"benign_percentage": 0.52,
"else_count": 2
},
"Exfil": {
"refusal_count": 41,
"malicious_count": 50,
"benign_count": 6,
"total_count": 100,
"benign_percentage": 0.47,
"else_count": 3
},
"Lateral Movement": {
"refusal_count": 41,
"malicious_count": 47,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.49,
"else_count": 4
},
"Persistence": {
"refusal_count": 41,
"malicious_count": 53,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.46,
"else_count": 1
},
"Privilege Escalation": {
"refusal_count": 53,
"malicious_count": 41,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.58,
"else_count": 1
},
"Recon": {
"refusal_count": 3,
"malicious_count": 75,
"benign_count": 15,
"total_count": 100,
"benign_percentage": 0.18,
"else_count": 7
}
},
"codellama-34b-instruct": {
"C2": {
"refusal_count": 22,
"malicious_count": 69,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.27,
"else_count": 4
},
"Collection": {
"refusal_count": 29,
"malicious_count": 58,
"benign_count": 8,
"total_count": 100,
"benign_percentage": 0.37,
"else_count": 5
},
"Discovery": {
"refusal_count": 7,
"malicious_count": 76,
"benign_count": 15,
"total_count": 100,
"benign_percentage": 0.22,
"else_count": 2
},
"Evasion": {
"refusal_count": 46,
"malicious_count": 48,
"benign_count": 2,
"total_count": 100,
"benign_percentage": 0.48,
"else_count": 4
},
"Execution": {
"refusal_count": 39,
"malicious_count": 53,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.44,
"else_count": 3
},
"Exfil": {
"refusal_count": 33,
"malicious_count": 59,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.38,
"else_count": 3
},
"Lateral Movement": {
"refusal_count": 41,
"malicious_count": 52,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.46,
"else_count": 2
},
"Persistence": {
"refusal_count": 38,
"malicious_count": 55,
"benign_count": 5,
"total_count": 100,
"benign_percentage": 0.43,
"else_count": 2
},
"Privilege Escalation": {
"refusal_count": 37,
"malicious_count": 51,
"benign_count": 7,
"total_count": 100,
"benign_percentage": 0.44,
"else_count": 5
},
"Recon": {
"refusal_count": 4,
"malicious_count": 67,
"benign_count": 22,
"total_count": 100,
"benign_percentage": 0.26,
"else_count": 7
}
}
}