# Spaces: livecodebench / code_generation_samples (Running)
#
# File size: 3,605 Bytes

import glob
import json
import os
import random

import numpy as np
from flask import Flask, abort, render_template, request

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)


# Read the JSON inputs as UTF-8 explicitly: without an encoding argument,
# open() falls back to the platform locale, which can mis-decode or reject
# non-ASCII text in the files on systems whose default is not UTF-8.
with open("problems.json", encoding="utf-8") as f:
    problems = json.load(f)
    # Titles in the file's original (pre-shuffle) order.
    problem_choices = [q["question_title"] for q in problems]

# Shuffle the display order once at import time. After this,
# random_idxs[pos] is the ORIGINAL index of the problem at display position
# `pos`, and `problems` itself is reordered into display order.
random_idxs = list(range(len(problems)))
random.shuffle(random_idxs)
problems = [problems[idx] for idx in random_idxs]

with open("all_outputs.json", encoding="utf-8") as f:
    all_outputs = json.load(f)
    all_models = list(all_outputs.keys())


num_questions_filtered = len(problems)

# Mean of each model's "pass1_list", keyed by ORIGINAL problem index (not by
# display position).
# NOTE(review): assumes all_outputs[model] is a list aligned with the
# original order of problems.json -- confirm against the data producer.
all_correctness_by_problem = {
    idx: {model: np.mean(all_outputs[model][idx]["pass1_list"]) for model in all_models}
    for idx in random_idxs
}


def calculate_color(performance):
    """Return the CSS ``rgba(...)`` colour string for a pass rate.

    Args:
        performance: Fraction of passing samples, compared against the
            thresholds 0.75, 0.5 and 0.25.

    Returns:
        A fixed half-transparent green above 0.75, a fixed half-transparent
        red at 0.25 or below, and for the two middle bands a colour whose
        alpha is ``performance`` (above 0.5) or ``1 - performance``
        (above 0.25).
    """
    if performance > 0.75:
        return "rgba(0, 150, 0, 0.5)"
    if performance > 0.5:
        alpha = performance
        return f"rgba(50, 150, 0, {alpha})"
    if performance > 0.25:
        alpha = 1 - performance
        return f"rgba(150, 50, 0, {alpha})"
    # Everything else (including values that fail every comparison) is red.
    return "rgba(150, 0, 0,  0.5)"


# One entry per problem, in display order:
#   (display position, {model: {"correctness", "correctness_color"}}, difficulty)
# `idx` is the original index used to key all_correctness_by_problem, while
# `trueidx` is the position in the shuffled display order.
all_evaluations_by_problem_colored = [
    (
        trueidx,
        {
            model: {
                "correctness": f"{all_correctness_by_problem[idx][model]*100:.1f}",
                "correctness_color": calculate_color(
                    all_correctness_by_problem[idx][model]
                ),
            }
            for model in all_models
        },
        # `problems` has already been reordered into display order, so it
        # must be indexed by display position. Indexing it with the original
        # index would attach another problem's difficulty to this row.
        problems[trueidx]["difficulty"],
    )
    for trueidx, idx in enumerate(random_idxs)
]

# For every model, a list in display order whose elements are the per-sample
# records ({"code": ..., "pass1": ...}) of one problem, built by pairing that
# problem's "code_list" with its "pass1_list".
all_data_for_view_formatted = {
    model_name: [
        [
            {"code": snippet, "pass1": passed}
            for snippet, passed in zip(
                outputs[orig_idx]["code_list"], outputs[orig_idx]["pass1_list"]
            )
        ]
        for orig_idx in random_idxs
    ]
    for model_name, outputs in all_outputs.items()
}


@app.route("/")
def home():
    """Render ``index.html`` with all models and the per-problem evaluations."""
    print(all_models)
    context = {
        "models": all_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index.html", **context)


@app.route("/problem/<int:problem_idx>")
def problem(problem_idx):
    """Render the detail page for one problem across all models.

    Args:
        problem_idx: Position of the problem in the shuffled display order,
            taken from the URL.

    Returns:
        The rendered ``problem.html`` template.

    Raises:
        werkzeug.exceptions.NotFound: If ``problem_idx`` does not refer to
            an existing problem (surfaced to the client as HTTP 404).
    """
    # The URL converter only guarantees an integer; an index past the end
    # would otherwise raise IndexError and produce an HTTP 500.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in all_models
    }
    # Element 1 of each evaluation tuple is the per-model results dict.
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    question = problems[problem_idx]

    return render_template(
        "problem.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=all_models,
        question=question,
        data=data,
    )


# Subset of model names used by the "/mini" and "/problem_mini/<idx>" views.
# Each name is looked up as a key in all_data_for_view_formatted, so the
# spelling must match the keys of the loaded outputs exactly.
mini_models = [
    "DeepSeek-V2",
    "DeepSeekCoder-V2",
    "DSCoder-33b-Ins",
    "LLama3-70b-Ins",
    "GPT-4-Turbo-2024-04-09",
    "GPT-4O-2024-05-13",
    "Claude-3-Opus",
    "Claude-3-Sonnet",
    "Gemini-Pro-1.5-May",
]


@app.route("/mini")
def mini():
    """Render ``index_mini.html`` with the reduced model list."""
    context = {
        "models": mini_models,
        "problems": all_evaluations_by_problem_colored,
    }
    return render_template("index_mini.html", **context)


@app.route("/problem_mini/<int:problem_idx>")
def problem_mini(problem_idx):
    """Render the detail page for one problem, limited to ``mini_models``.

    Args:
        problem_idx: Position of the problem in the shuffled display order,
            taken from the URL.

    Returns:
        The rendered ``problem_mini.html`` template.

    Raises:
        werkzeug.exceptions.NotFound: If ``problem_idx`` does not refer to
            an existing problem (surfaced to the client as HTTP 404).
    """
    # The URL converter only guarantees an integer; an index past the end
    # would otherwise raise IndexError and produce an HTTP 500.
    if not 0 <= problem_idx < len(problems):
        abort(404)

    data = {
        model: all_data_for_view_formatted[model][problem_idx] for model in mini_models
    }
    # Element 1 of each evaluation tuple is the per-model results dict; it
    # holds entries for every model, not only those in mini_models.
    evaluation = all_evaluations_by_problem_colored[problem_idx][1]
    question = problems[problem_idx]

    return render_template(
        "problem_mini.html",
        problem_idx=problem_idx,
        evaluation=evaluation,
        models=mini_models,
        question=question,
        data=data,
    )


if __name__ == "__main__":
    # When run as a script, serve on all network interfaces on port 7860.
    app.run(
        host="0.0.0.0",
        port=7860,
    )