open_pt_llm_leaderboard / external_models_results.json
eduagarcia's picture
fix gemini results
580b92c
[
{
"model": "sabia-2-small",
"name": "Sabiá-2 Small",
"link": "https://www.maritaca.ai/",
"date": "2024-04-12",
"status": "full",
"main_language": "Portuguese",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7172848145556333,
"bluex": 0.5549374130737135,
"oab_exams": 0.6364464692482916,
"assin2_sts": 0.7053302344881672,
"assin2_rte": 0.9121728362223306,
"faquad_nli": 0.7575848453041435,
"hatebr_offensive": 0.753800795680591,
"portuguese_hate_speech": 0.6975326368290793,
"tweetsentbr": 0.7119699374276466
},
"result_metrics_average": 0.7163399980921773,
"result_metrics_npm": 0.5744541501392351
},
{
"model": "sabia-2-medium",
"name": "Sabiá-2 Medium",
"link": "https://www.maritaca.ai/",
"date": "2024-04-13",
"status": "full",
"main_language": "Portuguese",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.8180545836249126,
"bluex": 0.717663421418637,
"oab_exams": 0.7321184510250569,
"assin2_sts": 0.7804108376537757,
"assin2_rte": 0.923459363368553,
"faquad_nli": 0.7657657657657658,
"hatebr_offensive": 0.8349989882997386,
"portuguese_hate_speech": 0.7379326358571694,
"tweetsentbr": 0.7269533040381798
},
"result_metrics_average": 0.7819285945613098,
"result_metrics_npm": 0.6676121786922709
},
{
"model": "gpt-3.5-turbo-0125",
"name": "GPT-3.5 Turbo (0125)",
"link": "https://www.openai.com/",
"date": "2024-03-08",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7214835549335199,
"bluex": 0.6244784422809457,
"oab_exams": 0.5430523917995445,
"assin2_sts": 0.7378460201077941,
"assin2_rte": 0.8823038414050672,
"faquad_nli": 0.746353108609074,
"hatebr_offensive": 0.8056205941193919,
"portuguese_hate_speech": 0.7363692688971499,
"tweetsentbr": 0.7028981330613626
},
"result_metrics_average": 0.7222672616904278,
"result_metrics_npm": 0.5841504766165372
},
{
"model": "claude-3-haiku-20240307",
"name": "Claude-3 Haiku (20240307)",
"link": "https://www.claude.ai/",
"date": "2024-04-13",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7718684394681595,
"bluex": 0.6662030598052852,
"oab_exams": 0.626879271070615,
"assin2_sts": 0.7892124744168747,
"assin2_rte": 0.9184462138121732,
"faquad_nli": 0.6340996599941455,
"hatebr_offensive": 0.8023698759439051,
"portuguese_hate_speech": 0.7342166269560177,
"tweetsentbr": 0.7303315733000207
},
"result_metrics_average": 0.7415141327519107,
"result_metrics_npm": 0.6037151240886439
},
{
"model": "gemini-1.0-pro",
"name": "Gemini 1.0 Pro",
"link": "https://ai.google.dev/",
"date": "2024-03-08",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7130860741777467,
"bluex": 0.5869262865090403,
"oab_exams": 0.4988610478359909,
"assin2_sts": 0.7058831239763663,
"assin2_rte": 0.8945993304651698,
"faquad_nli": 0.7070913567220611,
"hatebr_offensive": 0.8086330094493972,
"portuguese_hate_speech": 0.699119105113102,
"tweetsentbr": 0.6803240476660983
},
"result_metrics_average": 0.6993914868794414,
"result_metrics_npm": 0.551208000273598
},
{
"model": "gemini-1.5-pro-preview-0409",
"name": "Gemini 1.5 Pro Preview (0409)",
"link": "https://cloud.google.com/vertex-ai",
"date": "2024-04-15",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.8509447165850245,
"bluex": 0.7719054242002782,
"oab_exams": 0.6888382687927107,
"assin2_sts": 0.8159702278408203,
"assin2_rte": 0.9328989988467518,
"faquad_nli": 0.7290756302521009,
"hatebr_offensive": 0.8697698647467024,
"portuguese_hate_speech": 0.7539414414414414,
"tweetsentbr": 0.772785080895884
},
"result_metrics_average": 0.7984588504001905,
"result_metrics_npm": 0.6908188311933006
},
{
"model": "deepseek-v2-chat",
"name": "DeepSeek-V2 Chat (API)",
"link": "https://www.deepseek.com/",
"date": "2024-05-18",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7844646606018194,
"bluex": 0.6954102920723226,
"oab_exams": 0.564009111617312,
"assin2_sts": 0.8533174657651231,
"assin2_rte": 0.9440170304568147,
"faquad_nli": 0.7995469048381548,
"hatebr_offensive": 0.8842986491071644,
"portuguese_hate_speech": 0.7271736342651962,
"tweetsentbr": 0.6835304759163984
},
"result_metrics_average": 0.7706409138489229,
"result_metrics_npm": 0.655901521190756
},
{
"model": "gemini-1.5-flash-preview-0514",
"name": "Gemini 1.5 Flash Preview (0514)",
"link": "https://cloud.google.com/vertex-ai",
"date": "2024-05-18",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.8264520643806857,
"bluex": 0.7482614742698191,
"oab_exams": 0.6419134396355353,
"assin2_sts": 0.841655158151231,
"assin2_rte": 0.9362097477374545,
"faquad_nli": 0.8092185592185592,
"hatebr_offensive": 0.9099110141445836,
"portuguese_hate_speech": 0.6875904275305673,
"tweetsentbr": 0.7219800292667018
},
"result_metrics_average": 0.7914657682594597,
"result_metrics_npm": 0.6834036936130392
},
{
"model": "gemini-1.5-flash",
"name": "Gemini 1.5 Flash",
"link": "https://cloud.google.com/vertex-ai",
"date": "2024-08-09",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.8306508047585724,
"bluex": 0.7579972183588317,
"oab_exams": 0.6446469248291572,
"assin2_sts": 0.838806085610371,
"assin2_rte": 0.9366169973822607,
"faquad_nli": 0.7963910785668922,
"hatebr_offensive": 0.9092078461170015,
"portuguese_hate_speech": 0.6932563987219857,
"tweetsentbr": 0.7312948963367732
},
"result_metrics_average": 0.7932075834090939,
"result_metrics_npm": 0.6855338135928848
},
{
"model": "gpt-4o-mini-2024-07-18",
"name": "GPT 4o Mini (2024-07-18)",
"link": "https://www.openai.com/",
"date": "2024-07-25",
"status": "full",
"main_language": "English",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.7669699090272918,
"bluex": 0.6842837273991655,
"oab_exams": 0.6013667425968109,
"assin2_sts": 0.7259038954527597,
"assin2_rte": 0.942809846745341,
"faquad_nli": 0.819807735300693,
"hatebr_offensive": 0.8682357029532165,
"portuguese_hate_speech": 0.7501413502853012,
"tweetsentbr": 0.7509303825869922
},
"result_metrics_average": 0.7678276991497301,
"result_metrics_npm": 0.6595966999910003
},
{
"model": "nemotron-4-340b-instruct",
"name": "nvidia/Nemotron-4-340B-Instruct (Nvidia API)",
"link": "https://build.nvidia.com/nvidia/nemotron-4-340b-instruct",
"date": "2024-06-30",
"status": "full",
"main_language": "English",
"model_type": "chat",
"params": 340.0,
"result_metrics": {
"enem_challenge": 0.6648005598320503,
"bluex": 0.6578581363004172,
"oab_exams": 0.7020501138952164,
"assin2_sts": 0.7857731021403329,
"assin2_rte": 0.9489354458928496,
"faquad_nli": 0.8194444444444444,
"hatebr_offensive": 0.8641580001234928,
"portuguese_hate_speech": 0.7761835184102864,
"tweetsentbr": 0.780880021326841
},
"result_metrics_average": 0.7777870380406591,
"result_metrics_npm": 0.6740728488043128
},
{
"model": "llama_405b_instruct",
"name": "meta-llama/Meta-Llama-3.1-405B-Instruct (Vertex AI)",
"link": "https://cloud.google.com/vertex-ai",
"date": "2024-08-20",
"status": "full",
"main_language": "English",
"model_type": "chat",
"params": 406.0,
"result_metrics": {
"enem_challenge": 0.8523442967109867,
"bluex": 0.8011126564673157,
"oab_exams": 0.7640091116173121,
"assin2_sts": 0.7888441732870783,
"assin2_rte": 0.9476445477916471,
"faquad_nli": 0.825063276593557,
"hatebr_offensive": 0.9073940659389119,
"portuguese_hate_speech": 0.7191480935512969,
"tweetsentbr": 0.7821434639106575
},
"result_metrics_average": 0.8208559650965292,
"result_metrics_npm": 0.7286932366792048
},
{
"model": "sabia-3",
"name": "Sabiá-3",
"link": "https://www.maritaca.ai/",
"date": "2024-08-20",
"status": "full",
"main_language": "Portuguese",
"model_type": "proprietary",
"result_metrics": {
"enem_challenge": 0.8789363191042687,
"bluex": 0.7899860917941586,
"oab_exams": 0.8391799544419134,
"assin2_sts": 0.8253863689009022,
"assin2_rte": 0.9477034821619312,
"faquad_nli": 0.8243848812618203,
"hatebr_offensive": 0.8278737774590023,
"portuguese_hate_speech": 0.7241071428571428,
"tweetsentbr": 0.7510613086648664
},
"result_metrics_average": 0.8231799251828895,
"result_metrics_npm": 0.7241097388486535
}
]