Spaces:

JetBrains-Research
/

commit-message-editing

Running

Petr Tsvetkov committed on Mar 25

Commit

928b43c

•

1 Parent(s): b681eac

Switch to the special commit rewriting dataset

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import random
 import uuid
@@ -19,7 +20,9 @@ n_samples = len(data)
 saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
-def convert_diff_to_unified(diff):
     result = "\n".join(
         [
             f'--- {modified_file["old_path"]}\n'

+import json
 import os
 import random
 import uuid
 saver = get_dataset_saver(HF_TOKEN, HF_DATASET, private=True)
+def convert_diff_to_unified(diff_string):
+    diff = json.loads(diff_string)
     result = "\n".join(
         [
             f'--- {modified_file["old_path"]}\n'

data_loader.py CHANGED Viewed

@@ -1,22 +1,15 @@
 from datasets import load_dataset
-MODEL = 'cmg_gpt_4_0613'
 CACHE_DIR = 'cache'
 def load_data():
-    dataset = load_dataset("JetBrains-Research/lca-cmg",
-                           "commitchronicle-py-long",
-                           split="test",
-                           cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo']).rename(
-        columns={'message': 'reference'})
-    model_dataset = load_dataset("JetBrains-Research/lca-results",
-                                 MODEL,
-                                 split="test",
-                                 cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo'])[["prediction"]]
-    model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]
-    dataset = dataset.join(other=model_dataset)
-    return dataset.reset_index().to_dict('records')

+import os
 from datasets import load_dataset
 CACHE_DIR = 'cache'
+N_SAMPLES = 15
 def load_data():
+    df = load_dataset("petrtsv-jb/commit-rewriting-samples",
+                      split="train",
+                      token=os.environ.get('HF_REWRITING_TOKEN'),
+                      cache_dir=CACHE_DIR).to_pandas()
+    return df.to_dict('records')[:N_SAMPLES]