Petr Tsvetkov committed on
Commit
0dd349d
·
1 Parent(s): a4dedae

Update the labeling app to use new data

Browse files
Files changed (2) hide show
  1. app.py +2 -1
  2. data_loader.py +9 -22
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import random
3
  import uuid
@@ -56,7 +57,7 @@ def update_commit_view(sample_ind):
56
 
57
  record = data[sample_ind]
58
 
59
- diff_view = get_diff2html_view(convert_diff_to_unified(record['mods']))
60
 
61
  repo_val = record['repo']
62
  hash_val = record['hash']
 
1
+ import json
2
  import os
3
  import random
4
  import uuid
 
57
 
58
  record = data[sample_ind]
59
 
60
+ diff_view = get_diff2html_view(convert_diff_to_unified(json.loads(record['mods'])))
61
 
62
  repo_val = record['repo']
63
  hash_val = record['hash']
data_loader.py CHANGED
@@ -1,29 +1,16 @@
 
 
1
  from datasets import load_dataset
2
 
3
- MODELS = [
4
- 'cmg_codellama13b-instruct', 'cmg_gpt_4_0613', 'cmg_deepseek-coder-33b-instruct']
 
5
 
6
  CACHE_DIR = 'cache'
7
 
8
 
9
  def load_data():
10
- dataset = load_dataset("JetBrains-Research/lca-cmg",
11
- "commitchronicle-py-long",
12
- split="test",
13
- cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo']).rename(
14
- columns={'message': 'reference'})
15
-
16
- message_cols = ['reference']
17
-
18
- for model in MODELS:
19
- model_dataset = load_dataset("JetBrains-Research/lca-results",
20
- model,
21
- split="test",
22
- cache_dir=CACHE_DIR).to_pandas().set_index(['hash', 'repo'])[["prediction"]]
23
- model_dataset = model_dataset[~model_dataset.index.duplicated(keep='first')]
24
-
25
- cur_col_name = f"{model}"
26
- dataset = dataset.join(other=model_dataset).rename(columns={'prediction': cur_col_name})
27
- message_cols.append(cur_col_name)
28
-
29
- return dataset.reset_index().to_dict("records"), message_cols
 
1
+ import os
2
+
3
  from datasets import load_dataset
4
 
5
+ HF_TOKEN = os.environ.get('HF_TOKEN')
6
+
7
+ MESSAGE_COLS = ['reference', 'prediction', 'enhanced']
8
 
9
  CACHE_DIR = 'cache'
10
 
11
 
12
  def load_data():
13
+ return load_dataset("JetBrains-Research/commit-labeling-samples",
14
+ split="train",
15
+ cache_dir=CACHE_DIR,
16
+ token=HF_TOKEN).to_pandas().to_dict("records"), MESSAGE_COLS