commit-message-editing / data_loader.py
Petr Tsvetkov
Some fixes
9737991
raw
history blame
837 Bytes
from datasets import load_dataset
# Configuration name passed to `load_dataset` when fetching model predictions
# from "JetBrains-Research/lca-results" in `load_data`.
MODEL = 'cmg_gpt_4_0613'
# Directory passed as `cache_dir` to every `load_dataset` call in this module.
CACHE_DIR = 'cache'
def load_data():
    """Load the reference commit messages joined with the model's predictions.

    Two datasets are fetched with ``load_dataset`` (both using the ``"test"``
    split and ``CACHE_DIR`` as the cache directory):

    * ``"JetBrains-Research/lca-cmg"`` / ``"commitchronicle-py-long"`` — its
      ``message`` column is renamed to ``reference``;
    * ``"JetBrains-Research/lca-results"`` / ``MODEL`` — only its
      ``prediction`` column is kept.

    Both are indexed by ``(hash, repo)`` and combined with ``DataFrame.join``
    (a left join by default, so every row of the first dataset is kept).

    Returns:
        A list of dicts, one per row of the joined frame, with ``hash`` and
        ``repo`` restored as ordinary keys.
    """
    key_columns = ['hash', 'repo']

    # Reference side: the benchmark rows, keyed by (hash, repo).
    references = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=CACHE_DIR,
    ).to_pandas()
    references = references.set_index(key_columns)
    references = references.rename(columns={'message': 'reference'})

    # Prediction side: keep only the `prediction` column under the same key.
    predictions = load_dataset(
        "JetBrains-Research/lca-results",
        MODEL,
        split="test",
        cache_dir=CACHE_DIR,
    ).to_pandas()
    predictions = predictions.set_index(key_columns)[["prediction"]]

    # Keep only the first prediction per (hash, repo) so the join cannot
    # multiply reference rows.
    repeated_key = predictions.index.duplicated(keep='first')
    predictions = predictions[~repeated_key]

    combined = references.join(other=predictions)
    return combined.reset_index().to_dict('records')