Upload evaluation.py
evaluation/evaluation.py  +239 -0
ADDED
@@ -0,0 +1,239 @@
import os
import numpy as np
import jsonlines
from collections import defaultdict
from sklearn.metrics import classification_report


def get_links(sample_string, sample_index):
    """
    Takes a sample string and returns a list of attachment tuples,
    a list of (rel, x, y) tuples, and the [good, bad] counts of
    well-formed and malformed links.
    """
    # MINECRAFT labels
    labels = ['COM','CONTR','CORR','QAP','ACK','ELAB','CLARIFQ','COND','CONTIN',
              'RES','EXPL','QELAB','ALT','NARR','CONFQ','SEQ']

    split_list = [st.strip() for st in sample_string.split(' ')]

    rel_list = []
    attach_list = []
    bad = 0
    good = 0
    for a in split_list:
        s_tuple = None
        rel = None
        try:
            s = a.split('(')[1].split(')')[0].split(',')
            r = a.split('(')[0].strip()
        except IndexError:
            print('split error at ', sample_index)
        else:
            try:
                s_tuple = (int(s[0]), int(s[1]))
            except IndexError:
                print('split error at ', sample_index)
            except ValueError:
                print('value error at ', sample_index)
            if r in labels:
                # make sure the label is well-formed
                rel = r

        # if rel is not None and s_tuple is not None:  # if not using a DISTANCE cutoff
        if rel is not None and s_tuple is not None and (s_tuple[1] - s_tuple[0]) <= 15:  # if using a DISTANCE cutoff
            attach_list.append((int(s[0]), int(s[1])))
            rel_list.append(r)
            good += 1
        else:
            bad += 1

    # re-construct the full list: a list of tuples (rel, x, y),
    # but don't allow duplicate endpoints
    full_list = []
    endpoints = []
    for i, r in enumerate(attach_list):
        if r not in endpoints:
            endpoints.append(r)
            full_list.append((rel_list[i], r[0], r[1]))
    return endpoints, full_list, [good, bad]

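# Example (illustrative): a well-formed DS string looks like 'ELAB(3,4) QAP(4,5)', and
#   get_links('ELAB(3,4) QAP(4,5)', 0)
# returns
#   ([(3, 4), (4, 5)], [('ELAB', 3, 4), ('QAP', 4, 5)], [2, 0])
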
current_folder = os.getcwd()

gold_path = '/path/to/jsonl'
pred_path = '/path/to/llamipa_output.txt'
save_results = '/path/to/eval_.txt'  # to create

# get predicted
with open(pred_path, 'r') as txt:
    text = txt.read().split('\n')

pred_outputs = []

for t in text:
    if t.startswith(' ### DS:'):
        sample = t.split('### DS:')[1].strip()
        pred_outputs.append(sample)
print(len(pred_outputs))
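
# Each prediction is taken from a model output line of the form (illustrative)
#   ' ### DS: ELAB(3,4) QAP(4,5)'
# which yields the sample string 'ELAB(3,4) QAP(4,5)'.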

# get gold
gold_outputs = []

with jsonlines.open(gold_path) as reader:
    for obj in reader:
        if not obj['sample'].startswith('NEW DIALOGUE'):  # make sure to ignore incremental formatting
            gold_outputs.append(obj['PS'])
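
# Each gold record is assumed to carry a 'sample' field (the input dialogue text) and a
# 'PS' field holding the gold DS string, e.g. (illustrative)
#   {"sample": "...", "PS": "ELAB(3,4) QAP(4,5)"}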

att_f1_l = []
att_prec_l = []
att_rec_l = []

total_attach_tp = 0
total_attach_fp = 0
total_attach_fn = 0

type_f1_l = []
type_prec_l = []
type_rec_l = []

total_TP = []

matrix_list = []
bad_output = 0
good_output = 0

for i, s in enumerate(pred_outputs):

    pred_att, pred_all, malform = get_links(s, i)
    gold_att, gold_all, _ = get_links(gold_outputs[i], i)

    # track well-formed vs malformed links in the model predictions
    bad_output += malform[1]
    good_output += malform[0]

    # calculate the number of nulls there should be -- will use to check null count below
    common = len(set(pred_att).intersection(set(gold_att)))
    expected_nulls = (len(pred_att) - common) + (len(gold_att) - common)

    # calculate the precision, recall, and f1 for the sample FOR ATTACHMENTS
    if len(gold_att) > 0 and len(pred_att) > 0:
        prec = len([e for e in pred_att if e in gold_att])/len(pred_att)
        rec = len([e for e in pred_att if e in gold_att])/len(gold_att)
        total_attach_tp += len([e for e in pred_att if e in gold_att])
        total_attach_fp += len([e for e in pred_att if e not in gold_att])
        total_attach_fn += len([e for e in gold_att if e not in pred_att])
    else:
        prec = 0
        rec = 0
    att_prec_l.append(prec)
    att_rec_l.append(rec)
    if prec + rec == 0:
        att_f1_l.append(0)
    else:
        att_f1_l.append(2*prec*rec/(prec+rec))
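
    # Example (illustrative): if 2 of 3 predicted attachments are in a gold set of 4,
    # then prec = 2/3, rec = 2/4 = 0.5, and F1 = 2*prec*rec/(prec+rec) ≈ 0.571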

    # calculate the precision, recall, and f1 for the sample FOR ATTACHMENTS+RELATION TYPE
    if len(gold_all) > 0 and len(pred_all) > 0:
        prec = len([e for e in pred_all if e in gold_all])/len(pred_all)
        rec = len([e for e in pred_all if e in gold_all])/len(gold_all)
    else:
        prec = 0
        rec = 0
    type_prec_l.append(prec)
    type_rec_l.append(rec)
    if prec + rec == 0:
        type_f1_l.append(0)
    else:
        type_f1_l.append(2*prec*rec/(prec+rec))

    # create the relation comparisons by type
    TP = [e for e in pred_all if e in gold_all]
    leftover_pred = [p for p in pred_all if p not in TP]
    leftover_gold = [p for p in gold_all if p not in TP]

    # then process the TP, FP, FN for the matrix
    total_TP.extend(TP)

    rem_dict = defaultdict(list)
    for x in TP:
        matrix_list.append([x[0], x[0]])
    for x in leftover_pred:
        rem_dict[(x[1], x[2])].append(('p', x[0]))
    for x in leftover_gold:
        rem_dict[(x[1], x[2])].append(('g', x[0]))

    p_count = 0
    g_count = 0
    null_count = 0
    for k in rem_dict.keys():
        p = 'NULL'
        t = 'NULL'
        for re in rem_dict[k]:
            if re[0] == 'p':
                p = re[1]
                p_count += 1
            elif re[0] == 'g':
                t = re[1]
                g_count += 1
        matrix_list.append([t, p])
        if 'NULL' in [t, p]:
            null_count += 1

    assert len(TP) + p_count == len(pred_all)
    assert len(TP) + g_count == len(gold_all)
    assert null_count == expected_nulls
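
# Example (illustrative): each entry of matrix_list is a [gold_label, pred_label] pair.
# With pred_all = [('ELAB', 3, 4)] and gold_all = [('QAP', 3, 4)], the shared endpoint (3, 4)
# gets rem_dict[(3, 4)] = [('p', 'ELAB'), ('g', 'QAP')] and yields the pair ['QAP', 'ELAB'];
# an attachment present on only one side pairs its label with 'NULL'.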

# compute the label set appearing in gold and pred
gold = [m[0] for m in matrix_list]
pred = [m[1] for m in matrix_list]
gold.extend(pred)
labels = list(set(gold))

microf1 = total_attach_tp/(total_attach_tp + 0.5*(total_attach_fp + total_attach_fn))
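
# Note: TP/(TP + 0.5*(FP + FN)) is algebraically the usual micro-F1, 2*TP/(2*TP + FP + FN);
# e.g. (illustrative) TP=50, FP=10, FN=20 gives 50/65 = 100/130 ≈ 0.769.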

gold_list = [labels.index(m[0]) for m in matrix_list]
pred_list = [labels.index(m[1]) for m in matrix_list]

f = open(save_results, "w")
print("Attachment F1:", np.mean(att_f1_l), len(att_f1_l), file=f)
print("Attachment Average Precision:", np.mean(att_prec_l), file=f)
print("Attachment Average Recall:", np.mean(att_rec_l), file=f)
print('Micro F1: ', microf1, file=f)
print('--------------------------------', file=f)
print("Attachment + Rel F1:", np.mean(type_f1_l), len(type_f1_l), file=f)
print("Attachment + Rel Average Precision:", np.mean(type_prec_l), file=f)
print("Attachment + Rel Average Recall:", np.mean(type_rec_l), file=f)
print('---------------------------------------', file=f)
print(classification_report(gold_list, pred_list, target_names=labels), file=f)

# The F1-scores for the relation types displayed in the above table are correct.
# That is, while calculating F1 for label l, all the ["NULL", l] entries count towards false positives
# for label l and all the [l, "NULL"] entries count towards false negatives for label l.
# So the "NULL" type affects the precision/recall/F1 for label l (as it should).
# For the overall weighted average precision/recall/F1-score, however, we want the average
# to be over the actual relation label set (i.e. excluding the "NULL" class).
# For that, we do this:
d = classification_report(gold_list, pred_list, target_names=labels, output_dict=True)
prec = 0
rec = 0
f1 = 0
count = 0

for label in labels:
    if label != "NULL":
        prec += d[label]["precision"]*d[label]["support"]
        rec += d[label]["recall"]*d[label]["support"]
        f1 += d[label]["f1-score"]*d[label]["support"]
        count += d[label]["support"]
        # checking that support is the same as the number of ground-truth instances for the label
        # assert d[label]["support"] == Counter(g_label_l)[label]
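
# Example (illustrative): two labels with supports 10 and 30 and precisions 0.8 and 0.6
# give a weighted precision of (0.8*10 + 0.6*30)/40 = 0.65.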

print('--------------------------------', file=f)
print("Weighted Average Precision:", prec/count, file=f)
print("Weighted Average Recall:", rec/count, file=f)
print("Weighted Average F1 score:", f1/count, file=f)

f.close()
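
# To run (illustrative): set gold_path, pred_path, and save_results above, then
#   python evaluation/evaluation.py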