Petr Tsvetkov committed on
Commit
14bb44e
1 Parent(s): bc37ba1

New data loader module added

Browse files
Files changed (2) hide show
  1. .gitignore +1 -1
  2. data_loader.py +29 -0
.gitignore CHANGED
@@ -277,6 +277,6 @@ pip-selfcheck.json
277
 
278
  .idea
279
 
280
- data
281
  feedback
282
  flagged
 
277
 
278
  .idea
279
 
280
+ cache
281
  feedback
282
  flagged
data_loader.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
+ MODELS = [
4
+ 'cmg_codellama13b-instruct', 'cmg_gpt_4_0613', 'deepseek-coder-33b-instruct']
5
+
6
+ CACHE_DIR = 'cache'
7
+
8
+
9
def load_data():
    """Load reference commit messages and attach each model's predictions.

    The reference data is the "test" split of the "commitchronicle-py-long"
    configuration of "JetBrains-Research/lca-cmg"; its ``message`` column is
    renamed to ``reference``. For every name in ``MODELS`` the matching
    configuration of "JetBrains-Research/lca-results" is loaded and its
    ``prediction`` column is joined onto the reference table by the
    ``(hash, repo)`` pair, under a column named after the model.

    All downloads are cached under ``CACHE_DIR``.

    Returns:
        A ``(records, message_cols)`` tuple. ``records`` is a list with one
        dict per row of the joined table (``hash`` and ``repo`` restored as
        ordinary keys). ``message_cols`` lists the columns that hold commit
        messages: ``'reference'`` first, then one entry per model in
        ``MODELS`` order.
    """
    index_keys = ['hash', 'repo']

    references = load_dataset(
        "JetBrains-Research/lca-cmg",
        "commitchronicle-py-long",
        split="test",
        cache_dir=CACHE_DIR,
    ).to_pandas()
    dataset = references.set_index(index_keys).rename(columns={'message': 'reference'})

    message_cols = ['reference']

    for model in MODELS:
        raw_predictions = load_dataset(
            "JetBrains-Research/lca-results",
            model,
            split="test",
            cache_dir=CACHE_DIR,
        ).to_pandas()
        model_dataset = raw_predictions.set_index(index_keys)[["prediction"]]

        # Keep only the first prediction per (hash, repo) so the join below
        # cannot multiply rows of the reference table.
        model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]

        # The model name doubles as the name of its prediction column.
        dataset = dataset.join(other=model_dataset).rename(columns={'prediction': model})
        message_cols.append(model)

    return dataset.reset_index().to_dict("records"), message_cols