Spaces:

JetBrains-Research
/

commit-labeling

Sleeping

Petr Tsvetkov committed on Apr 24, 2024

Commit

0dd349d

1 Parent(s): a4dedae

Update the labeling app to use new data

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import random
 import uuid
@@ -56,7 +57,7 @@ def update_commit_view(sample_ind):
     record = data[sample_ind]
-    diff_view = get_diff2html_view(convert_diff_to_unified(record['mods']))
     repo_val = record['repo']
     hash_val = record['hash']

+import json
 import os
 import random
 import uuid
     record = data[sample_ind]
+    diff_view = get_diff2html_view(convert_diff_to_unified(json.loads(record['mods'])))
     repo_val = record['repo']
     hash_val = record['hash']

data_loader.py CHANGED Viewed

@@ -1,29 +1,16 @@
 from datasets import load_dataset
-MODELS = [
-    'cmg_codellama13b-instruct', 'cmg_gpt_4_0613', 'cmg_deepseek-coder-33b-instruct']
 CACHE_DIR = 'cache'
 def load_data():
-    dataset = load_dataset("JetBrains-Research/lca-cmg",
-                           "commitchronicle-py-long",
-                           split="test",
-                           cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo']).rename(
-        columns={'message': 'reference'})
-    message_cols = ['reference']
-    for model in MODELS:
-        model_dataset = load_dataset("JetBrains-Research/lca-results",
-                                     model,
-                                     split="test",
-                                     cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo'])[["prediction"]]
-        model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]
-        cur_col_name = f"{model}"
-        dataset = dataset.join(other=model_dataset).rename(columns={'prediction': cur_col_name})
-        message_cols.append(cur_col_name)
-    return dataset.reset_index().to_dict("records"), message_cols

+import os
 from datasets import load_dataset
+HF_TOKEN = os.environ.get('HF_TOKEN')
+MESSAGE_COLS = ['reference', 'prediction', 'enhanced']
 CACHE_DIR = 'cache'
 def load_data():
+    return load_dataset("JetBrains-Research/commit-labeling-samples",
+                        split="train",
+                        cache_dir=CACHE_DIR,
+                        token=HF_TOKEN).to_pandas().to_dict("records"), MESSAGE_COLS