|
from datasets import load_dataset |
|
|
|
# Configuration name requested from the "JetBrains-Research/lca-results"
# dataset by load_data(); it selects which model's predictions are loaded.
MODEL = 'cmg_gpt_4_0613'

# Directory handed to `load_dataset` as its `cache_dir` argument.
CACHE_DIR = 'cache'
|
|
|
|
|
def load_data(model=None, cache_dir=None):
    """Load the commit-message test set joined with one model's predictions.

    Reads the "test" split of the "commitchronicle-py-long" configuration of
    "JetBrains-Research/lca-cmg" and the "test" split of the ``model``
    configuration of "JetBrains-Research/lca-results", then left-joins the
    latter's ``prediction`` column onto the former by ``(hash, repo)``.

    Args:
        model: Configuration name in "JetBrains-Research/lca-results".
            When ``None`` (the default), the module-level ``MODEL`` is used.
        cache_dir: Directory passed to ``load_dataset`` as ``cache_dir``.
            When ``None`` (the default), the module-level ``CACHE_DIR`` is
            used.

    Returns:
        A list of dicts (``DataFrame.to_dict('records')``), one per row of
        the lca-cmg test split. The original ``message`` column is exposed
        as ``reference``; ``prediction`` is NaN for rows that have no
        matching ``(hash, repo)`` entry in the results dataset.
    """
    # Resolve defaults at call time so reassigning the module globals still
    # takes effect, exactly as it did when they were referenced directly.
    if model is None:
        model = MODEL
    if cache_dir is None:
        cache_dir = CACHE_DIR

    join_key = ['hash', 'repo']

    references = (
        load_dataset(
            "JetBrains-Research/lca-cmg",
            "commitchronicle-py-long",
            split="test",
            cache_dir=cache_dir,
        )
        .to_pandas()
        .set_index(join_key)
        .rename(columns={'message': 'reference'})
    )

    predictions = (
        load_dataset(
            "JetBrains-Research/lca-results",
            model,
            split="test",
            cache_dir=cache_dir,
        )
        .to_pandas()
        .set_index(join_key)[["prediction"]]
    )

    # Keep only the first prediction per (hash, repo); a non-unique right-hand
    # index would otherwise multiply rows in the join below.
    predictions = predictions[~predictions.index.duplicated(keep='first')]

    # DataFrame.join defaults to a left join, so every reference row is kept.
    merged = references.join(other=predictions)

    return merged.reset_index().to_dict('records')
|
|