Petr Tsvetkov
committed on
Commit
•
928b43c
1
Parent(s):
b681eac
Switch to the special commit rewriting dataset
Browse files
- app.py +4 -1
- data_loader.py +8 -15
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
import random
|
3 |
import uuid
|
@@ -19,7 +20,9 @@ n_samples = len(data)
|
|
19 |
saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
|
20 |
|
21 |
|
22 |
-
def convert_diff_to_unified(
|
|
|
|
|
23 |
result = "\n".join(
|
24 |
[
|
25 |
f'--- {modified_file["old_path"]}\n'
|
|
|
1 |
+
import json
|
2 |
import os
|
3 |
import random
|
4 |
import uuid
|
|
|
20 |
saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
|
21 |
|
22 |
|
23 |
+
def convert_diff_to_unified(diff_string):
|
24 |
+
diff = json.loads(diff_string)
|
25 |
+
|
26 |
result = "\n".join(
|
27 |
[
|
28 |
f'--- {modified_file["old_path"]}\n'
|
data_loader.py
CHANGED
@@ -1,22 +1,15 @@
|
|
|
|
|
|
1 |
from datasets import load_dataset
|
2 |
|
3 |
-
MODEL = 'cmg_gpt_4_0613'
|
4 |
CACHE_DIR = 'cache'
|
|
|
5 |
|
6 |
|
7 |
def load_data():
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
columns={'message': 'reference'})
|
13 |
-
|
14 |
-
model_dataset = load_dataset("JetBrains-Research/lca-results",
|
15 |
-
MODEL,
|
16 |
-
split="test",
|
17 |
-
cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo'])[["prediction"]]
|
18 |
-
|
19 |
-
model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]
|
20 |
-
dataset = dataset.join(other=model_dataset)
|
21 |
|
22 |
-
return
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
from datasets import load_dataset
|
4 |
|
|
|
5 |
CACHE_DIR = 'cache'
|
6 |
+
N_SAMPLES = 15
|
7 |
|
8 |
|
9 |
def load_data():
|
10 |
+
df = load_dataset("petrtsv-jb/commit-rewriting-samples",
|
11 |
+
split="train",
|
12 |
+
token=os.environ.get('HF_REWRITING_TOKEN'),
|
13 |
+
cache_dir=CACHE_DIR).to_pandas()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
return df.to_dict('records')[:N_SAMPLES]
|