|
import os |
|
import random |
|
import glob |
|
import json |
|
|
|
import numpy as np |
|
from flask import Flask, render_template, request |
|
|
|
app = Flask(__name__)


# Problems are loaded once at import time, from the current working directory —
# the app must be launched from the directory containing problems.json.
with open("problems.json") as f:

    problems = json.load(f)

# NOTE(review): problem_choices is never referenced below — looks dead; confirm
# the templates don't use it before removing.
problem_choices = [q["question_title"] for q in problems]


# Shuffle the display order. random_idxs maps shuffled position -> original
# index, so after the rebinding below:
#   problems[pos] == original_problems[random_idxs[pos]]
random_idxs = list(range(len(problems)))

random.shuffle(random_idxs)

problems = [problems[idx] for idx in random_idxs]


# Per-model generation results; each model maps to a list aligned with the
# ORIGINAL (unshuffled) problem order — index it with original indices,
# not shuffled positions.
with open("all_outputs.json") as f:

    all_outputs = json.load(f)

all_models = list(all_outputs.keys())


num_questions_filtered = len(problems)


# Mean pass@1 per (problem, model). Keys are ORIGINAL problem indices (iterating
# random_idxs only changes dict insertion order, not the key set), matching the
# indexing of all_outputs rather than the shuffled `problems` list.
all_correctness_by_problem = {

    idx: {model: np.mean(all_outputs[model][idx]["pass1_list"]) for model in all_models}

    for idx in random_idxs

}
|
|
|
|
|
def calculate_color(performance):
    """Map a pass@1 fraction in [0, 1] to a CSS ``rgba()`` color string.

    Buckets: > 0.75 fixed translucent green; > 0.5 green with alpha equal to
    the score; > 0.25 red with alpha equal to ``1 - score``; otherwise fixed
    translucent red.

    Args:
        performance: mean pass@1 for one (problem, model) cell.

    Returns:
        An ``rgba(r, g, b, a)`` string usable as a CSS background color.
    """
    if performance > 0.75:
        # Fixed color: plain string — the original used a pointless f-prefix
        # with no placeholders (ruff F541).
        return "rgba(0, 150, 0, 0.5)"
    elif performance > 0.5:
        # More opaque green as the score rises.
        return f"rgba(50, 150, 0, {performance})"
    elif performance > 0.25:
        # More opaque red as the score falls.
        return f"rgba(150, 50, 0, {1-performance})"
    else:
        return "rgba(150, 0, 0, 0.5)"
|
|
|
|
|
# One entry per shuffled position: (position, {model: {"correctness" percent
# string, "correctness_color" rgba string}}, difficulty). The correctness map
# is keyed by the ORIGINAL index (idx), matching all_correctness_by_problem.
all_evaluations_by_problem_colored = [
    (
        trueidx,
        {
            model: {
                "correctness": f"{all_correctness_by_problem[idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[idx][model]
                ),
            }
            for model in all_models
        },
        # BUG FIX: `problems` was rebound to shuffled order above, so it must
        # be indexed by shuffled position (trueidx), not the original index
        # (idx) — the old problems[idx] reported a different problem's
        # difficulty whenever the shuffle was not the identity.
        problems[trueidx]["difficulty"],
    )
    for trueidx, idx in enumerate(random_idxs)
]
|
|
|
# For each model, the per-problem samples in shuffled display order: one list
# of {"code", "pass1"} dicts per problem, pairing every generated solution
# with its pass@1 result.
all_data_for_view_formatted = {
    model_name: [
        [
            {"code": code, "pass1": passed}
            for code, passed in zip(
                outputs[orig_idx]["code_list"], outputs[orig_idx]["pass1_list"]
            )
        ]
        for orig_idx in random_idxs
    ]
    for model_name, outputs in all_outputs.items()
}
|
|
|
|
|
@app.route("/")
def home():
    """Render the full leaderboard grid (all models x all problems)."""
    # Removed leftover debug print of the model list that spammed the server
    # log on every page load.
    return render_template(
        "index.html", models=all_models, problems=all_evaluations_by_problem_colored
    )
|
|
|
|
|
@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render the detail page for one problem.

    ``problem_idx`` is the position in the shuffled display order, matching
    the rows of the home-page grid.
    """
    # Per-model generated samples for this problem.
    samples_by_model = {
        name: all_data_for_view_formatted[name][problem_idx] for name in all_models
    }
    # The precomputed grid entry is (position, evaluation dict, difficulty);
    # only the evaluation dict is needed here.
    _, evaluation, _ = all_evaluations_by_problem_colored[problem_idx]
    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=all_models,
        question=problems[problem_idx],
        data=samples_by_model,
    )
|
|
|
|
|
# Curated subset of models shown on the /mini pages. Every name must exactly
# match a key of all_outputs.json, since /problem_mini indexes
# all_data_for_view_formatted with these strings — TODO(review): confirm all
# entries exist there.
mini_models = [

    "DeepSeekCoder-V2",

    "LLama3.1-70b-Ins",
    "LLama3.1-405b-Ins-FP8",

    "GPT-4O-2024-05-13",
    "Claude-3-Opus",

    "Gemini-Pro-1.5-August",
    "O1-Mini (N=1)",
    "O1-Preview (N=1)",
]
|
|
|
|
|
@app.route("/mini")
def mini():
    """Render the reduced leaderboard covering only the mini_models subset."""
    context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **context)
|
|
|
|
|
@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render the single-problem detail page restricted to mini_models.

    ``problem_idx`` is the position in the shuffled display order, matching
    the rows of the /mini grid.
    """
    # Grid entry is (position, evaluation dict, difficulty); slot 1 holds the
    # per-model correctness cells.
    entry = all_evaluations_by_problem_colored[problem_idx]
    per_model_samples = {
        name: all_data_for_view_formatted[name][problem_idx] for name in mini_models
    }
    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        evaluation=entry[1],
        models=mini_models,
        question=problems[problem_idx],
        data=per_model_samples,
    )
|
|
|
|
|
if __name__ == "__main__":
    # NOTE(review): binds the Flask development server to all interfaces
    # (0.0.0.0:7860) — fine for a local/containerized demo, but do not expose
    # this directly to the public internet.
    app.run(host="0.0.0.0", port=7860)
|
|