Petr Tsvetkov committed on
Commit
928b43c
1 Parent(s): b681eac

Switch to the special commit rewriting dataset

Browse files
Files changed (2) hide show
  1. app.py +4 -1
  2. data_loader.py +8 -15
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import random
3
  import uuid
@@ -19,7 +20,9 @@ n_samples = len(data)
19
  saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
20
 
21
 
22
- def convert_diff_to_unified(diff):
 
 
23
  result = "\n".join(
24
  [
25
  f'--- {modified_file["old_path"]}\n'
 
1
+ import json
2
  import os
3
  import random
4
  import uuid
 
20
  saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
21
 
22
 
23
+ def convert_diff_to_unified(diff_string):
24
+ diff = json.loads(diff_string)
25
+
26
  result = "\n".join(
27
  [
28
  f'--- {modified_file["old_path"]}\n'
data_loader.py CHANGED
@@ -1,22 +1,15 @@
 
 
1
  from datasets import load_dataset
2
 
3
- MODEL = 'cmg_gpt_4_0613'
4
  CACHE_DIR = 'cache'
 
5
 
6
 
7
  def load_data():
8
- dataset = load_dataset("JetBrains-Research/lca-cmg",
9
- "commitchronicle-py-long",
10
- split="test",
11
- cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo']).rename(
12
- columns={'message': 'reference'})
13
-
14
- model_dataset = load_dataset("JetBrains-Research/lca-results",
15
- MODEL,
16
- split="test",
17
- cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo'])[["prediction"]]
18
-
19
- model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]
20
- dataset = dataset.join(other=model_dataset)
21
 
22
- return dataset.reset_index().to_dict('records')
 
1
+ import os
2
+
3
  from datasets import load_dataset
4
 
 
5
  CACHE_DIR = 'cache'
6
+ N_SAMPLES = 15
7
 
8
 
9
  def load_data():
10
+ df = load_dataset("petrtsv-jb/commit-rewriting-samples",
11
+ split="train",
12
+ token=os.environ.get('HF_REWRITING_TOKEN'),
13
+ cache_dir=CACHE_DIR).to_pandas()
 
 
 
 
 
 
 
 
 
14
 
15
+ return df.to_dict('records')[:N_SAMPLES]