Spaces:

JetBrains-Research
/

commit-message-editing

Sleeping

Petr Tsvetkov

Some fixes

9737991 9 months ago

837 Bytes

	from datasets import load_dataset

	# Configuration name inside the "JetBrains-Research/lca-results" dataset
	# whose predictions are attached to the reference commits.
	MODEL: str = "cmg_gpt_4_0613"
	# Directory handed to `load_dataset` as its `cache_dir`.
	CACHE_DIR: str = "cache"


	def load_data(model: str = MODEL, cache_dir: str = CACHE_DIR) -> list:
	    """Load reference commit messages together with one model's predictions.

	    The "commitchronicle-py-long" configuration of
	    "JetBrains-Research/lca-cmg" (split "test") supplies the reference
	    rows; its ``message`` column is renamed to ``reference``. The
	    ``prediction`` column is taken from the ``model`` configuration of
	    "JetBrains-Research/lca-results" (split "test"). Both tables are keyed
	    by ``(hash, repo)``.

	    Args:
	        model: Configuration name inside "JetBrains-Research/lca-results".
	            Defaults to the module-level ``MODEL``.
	        cache_dir: Cache directory passed to ``load_dataset``. Defaults to
	            the module-level ``CACHE_DIR``.

	    Returns:
	        A list of dicts, one per reference row, with ``hash`` and ``repo``
	        restored as ordinary fields next to ``reference``, ``prediction``
	        and the remaining reference columns.
	    """
	    key = ["hash", "repo"]

	    references = load_dataset(
	        "JetBrains-Research/lca-cmg",
	        "commitchronicle-py-long",
	        split="test",
	        cache_dir=cache_dir,
	    ).to_pandas()
	    references = references.set_index(key).rename(
	        columns={"message": "reference"})

	    predictions = load_dataset(
	        "JetBrains-Research/lca-results",
	        model,
	        split="test",
	        cache_dir=cache_dir,
	    ).to_pandas()
	    predictions = predictions.set_index(key)[["prediction"]]

	    # Keep only the first prediction per (hash, repo); a duplicated key on
	    # the right-hand side would otherwise multiply rows in the join.
	    predictions = predictions[~predictions.index.duplicated(keep="first")]

	    # DataFrame.join defaults to a left join: every reference row is kept,
	    # and rows without a matching prediction get a missing value.
	    merged = references.join(other=predictions)

	    return merged.reset_index().to_dict("records")