import gradio as gr
# Synonyms for each task category
task_synonyms = {
    "Undergraduate level knowledge": ["undergraduate level knowledge", "MMLU"],
    "Graduate level reasoning": ["graduate level reasoning", "GPQA", "GPQA Diamond"],
    "Grade school math": ["grade school math", "GSM8K"],
    "Math problem-solving": ["math problem-solving", "MATH"],
    "Multilingual math": ["multilingual math", "MGSM"],
    "Code": ["code", "coding", "programming", "HumanEval"],
    "Reasoning over text": ["reasoning over text", "DROP", "F1 score"],
    "Mixed evaluations": ["mixed evaluations", "BIG-Bench-Hard"],
    "Knowledge Q&A": ["knowledge Q&A", "ARC-Challenge"],
    "Common Knowledge": ["common knowledge", "HellaSwag"],
}
# Reported benchmark scores (%) for the top three models per task category
performance_data = {
    "Undergraduate level knowledge": [("Claude 3 Opus", 86.8), ("GPT-4", 86.4), ("Gemini 1.0 Ultra", 83.7)],
    "Graduate level reasoning": [("Claude 3 Opus", 50.4), ("Claude 3 Sonnet", 40.4), ("GPT-4", 35.7)],
    "Grade school math": [("Claude 3 Opus", 95.0), ("Gemini 1.0 Ultra", 94.4), ("GPT-4", 92.0)],
    "Math problem-solving": [("Claude 3 Opus", 60.1), ("Gemini 1.0 Ultra", 53.2), ("GPT-4", 52.9)],
    "Multilingual math": [("Claude 3 Opus", 90.7), ("Claude 3 Sonnet", 83.5), ("Gemini 1.0 Ultra", 79.0)],
    "Code": [("Claude 3 Opus", 84.9), ("Gemini 1.0 Ultra", 74.4), ("Claude 3 Haiku", 75.9)],
    "Reasoning over text": [("Claude 3 Opus", 83.1), ("Gemini 1.0 Ultra", 82.4), ("GPT-4", 80.9)],
    "Mixed evaluations": [("Claude 3 Opus", 86.8), ("Gemini 1.0 Ultra", 83.6), ("GPT-4", 83.1)],
    "Knowledge Q&A": [("Claude 3 Opus", 96.4), ("GPT-4", 96.3), ("Claude 3 Sonnet", 93.2)],
    "Common Knowledge": [("Claude 3 Opus", 95.4), ("GPT-4", 95.3), ("Gemini 1.0 Ultra", 87.8)],
}
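# Sanity check (illustrative addition, not part of the original app): both
# dicts above are keyed by the same task categories, so every matched synonym
# is guaranteed to resolve to score data.
assert set(task_synonyms) == set(performance_data)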
def recommend_llm(task):
    # Normalize the input (trim whitespace, lowercase) and match it against
    # each category's synonym list
    task_lower = task.strip().lower()
    main_category = None
    for key, synonyms in task_synonyms.items():
        if task_lower in map(str.lower, synonyms):
            main_category = key
            break
    if not main_category:
        return "No data available for that task. Try e.g. 'coding' or 'grade school math'."
    # Sort the matched category's models by score, highest first
    recommendations = performance_data.get(main_category, [])
    recommendations_sorted = sorted(recommendations, key=lambda x: x[1], reverse=True)
    result = f"For {task}, the recommended LLMs are:\n"
    for i, (model, score) in enumerate(recommendations_sorted):
        result += f"{i + 1}. {model} with a score of {score}%\n"
    return result
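# Example (illustrative): "coding" is a synonym of the "Code" category, so
# recommend_llm("coding") returns the Code entries sorted by score:
#   For coding, the recommended LLMs are:
#   1. Claude 3 Opus with a score of 84.9%
#   2. Claude 3 Haiku with a score of 75.9%
#   3. Gemini 1.0 Ultra with a score of 74.4%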
# Gradio interface
interface = gr.Interface(
    fn=recommend_llm,
    inputs=gr.Textbox(label="Enter Task"),
    outputs=gr.Textbox(label="LLM Recommendations"),
    title="LLM Recommendation App",
    description=(
        "Enter a task to get recommendations for the best LLMs based on "
        "performance data. For example, you can enter 'coding', "
        "'undergraduate level knowledge', etc."
    ),
)
# Launch the app
interface.launch()
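# Optional (illustrative): on Hugging Face Spaces the default launch() above is
# sufficient; when running locally you can instead request a temporary public
# URL with Gradio's share flag:
#   interface.launch(share=True)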