Spaces:
Sleeping
Sleeping
Made a simple, functional Streamlit app to host the model
Browse files- app.py +38 -0
- models/2d_ridge_roberta-suicide-regchain-pca-final.pkl +0 -0
- requirements.txt +5 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-38.pyc +0 -0
- src/__pycache__/class_eval.cpython-38.pyc +0 -0
- src/__pycache__/data.cpython-38.pyc +0 -0
- src/__pycache__/embeddings.cpython-38.pyc +0 -0
- src/__pycache__/eval.cpython-38.pyc +0 -0
- src/__pycache__/multiregression.cpython-38.pyc +0 -0
- src/__pycache__/roberta_regressor.cpython-38.pyc +0 -0
- src/__pycache__/utils.cpython-38.pyc +0 -0
- src/berta_finetuning.py +28 -0
- src/class_eval.py +576 -0
- src/data.py +104 -0
- src/embeddings.py +49 -0
- src/eval.py +195 -0
- src/models.py +5 -0
- src/multiregression.py +108 -0
- src/roberta_regressor.py +196 -0
- src/train.py +92 -0
- src/utils.py +62 -0
app.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pickle
|
| 3 |
+
import numpy as np
|
| 4 |
+
import os, glob, json, sys
|
| 5 |
+
import pickle
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
|
| 10 |
+
from src import data, utils
|
| 11 |
+
from src.embeddings import EmbeddingsRegressor
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# load the models
@st.cache_resource
def _load_predictor():
    """Build the prediction callable once per server process.

    Streamlit re-executes this script on every widget interaction; caching the
    loader keeps the pickled regressor and the SentenceTransformer encoder from
    being reloaded for each submitted text.
    """
    # NOTE(security): pickle.load can execute arbitrary code. This is only safe
    # because the file is shipped inside the repository; never point it at
    # user-supplied data.
    with open('models/2d_ridge_roberta-suicide-regchain-pca-final.pkl', 'rb') as f:
        regressor = pickle.load(f)

    model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
    tokenizer = SentenceTransformer(model_name)
    model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=True)
    return utils.make_predict(model.predict)


predict = _load_predictor()

# model_selector = st.sidebar.selectbox(
#     'Select model:',
#     ['roberta', 'roberta_seq_multi', 'roberta_seq_multi_2']
# )

text_input = st.text_input('Enter your text here:')
if text_input:
    # One text in, one row of four values out.
    prediction = predict([text_input]).tolist()
    prediction = np.array(prediction).reshape(-1,4)
    prediction = utils.normalize(prediction)
    # Prefix the task_d columns with "d_" and replace '+' and '|' by '_'.
    preds_df = data.make_task_labels_from_d(prediction, include_d=True).rename(
        columns={c:'d_'+c.replace('+','_').replace('|','_') for c in data.task_d_cols}
    )
    preds_df['b_label'] = np.clip(preds_df['b_label'], 0, 1)
    # show the dataframe
    st.table(preds_df)
|
models/2d_ridge_roberta-suicide-regchain-pca-final.pkl
ADDED
|
Binary file (154 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
sentence-transformers
|
| 3 |
+
pandas
|
| 4 |
+
streamlit
|
| 5 |
+
scikit-learn>=1.2.1
|
src/__init__.py
ADDED
|
File without changes
|
src/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (172 Bytes). View file
|
|
|
src/__pycache__/class_eval.cpython-38.pyc
ADDED
|
Binary file (15.7 kB). View file
|
|
|
src/__pycache__/data.cpython-38.pyc
ADDED
|
Binary file (4.34 kB). View file
|
|
|
src/__pycache__/embeddings.cpython-38.pyc
ADDED
|
Binary file (2.15 kB). View file
|
|
|
src/__pycache__/eval.cpython-38.pyc
ADDED
|
Binary file (7.51 kB). View file
|
|
|
src/__pycache__/multiregression.cpython-38.pyc
ADDED
|
Binary file (4.34 kB). View file
|
|
|
src/__pycache__/roberta_regressor.cpython-38.pyc
ADDED
|
Binary file (6.73 kB). View file
|
|
|
src/__pycache__/utils.cpython-38.pyc
ADDED
|
Binary file (2.52 kB). View file
|
|
|
src/berta_finetuning.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
+
from datasets import Dataset, load_dataset#, Features, Value, ClassLabe
|
| 4 |
+
|
| 5 |
+
def _join_messages(messages):
    """Concatenate all messages of one subject, separated by ' | '."""
    return ' | '.join(messages)


# Fetch the dataset and switch to pandas for the grouping step.
ds = load_dataset('nlpUc3mStudents/mental-risk-c')
train_df = ds['train'].to_pandas()
test_df = ds['test'].to_pandas()

# Every column from position 4 onwards is treated as a label column.
label_names = train_df.columns[4:].tolist()

# One row per subject: messages are joined, labels take the first value seen.
aggregations = {'message': _join_messages}
aggregations.update(dict.fromkeys(label_names, 'first'))
train_by_subjectid = (
    train_df.groupby('subject_id')
    .agg(aggregations)
    .reset_index()
    # .assign(
    #     num_messages=lambda x: x.message.str.count('\|') + 1
    # )
)

# back to datasets
train_df = Dataset.from_pandas(train_by_subjectid)

model_name= 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# this model is trained with 2 labels, yet we need 4, so we need to change the head
model = None
|
| 27 |
+
|
| 28 |
+
|
src/class_eval.py
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#This file has been developed by the SINAI research group for its usage in the MentalRiskES evaluation campaign at IberLEF 2023.
|
| 2 |
+
|
| 3 |
+
# Required libraries
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
import sklearn.metrics as metrics
|
| 7 |
+
from scipy.stats import pearsonr
|
| 8 |
+
|
| 9 |
+
# Read Gold labels for BinaryClassification
|
| 10 |
+
def read_qrels(qrels_file):
    """Load binary gold labels from a CSV file.

    The file is parsed with ``pandas.read_csv`` and must contain a ``Subject``
    and a ``label`` column.

    Returns:
        dict mapping each subject to its label converted with ``int``.
    """
    gold = pd.read_csv(qrels_file)
    qrels = {
        subject: int(label)
        for subject, label in zip(gold['Subject'], gold['label'])
    }
    print(f"\n{len(qrels)} lines read in qrels file!\n\n")
    return qrels
|
| 17 |
+
|
| 18 |
+
# Read Gold labels for Simple Regression
|
| 19 |
+
def read_qrels_regression(qrels_file):
    """Load gold scores for the simple regression task from a CSV file.

    The file is parsed with ``pandas.read_csv`` and must contain a ``Subject``
    and a ``label`` column.

    Returns:
        dict mapping each subject to its label converted with ``float``.
    """
    gold = pd.read_csv(qrels_file)
    qrels = {
        subject: float(label)
        for subject, label in zip(gold['Subject'], gold['label'])
    }
    print(f"\n{len(qrels)} lines read in qrels file!\n\n")
    return qrels
|
| 26 |
+
|
| 27 |
+
# Read Gold labels for Multiclass classification
|
| 28 |
+
def read_qrels_multiclass(qrels_file):
    """Load multiclass gold labels from a CSV file, plus a binary view of them.

    The file is parsed with ``pandas.read_csv`` and must contain a ``Subject``
    and a ``label`` column holding strings.

    Returns:
        A pair ``(binary, multiclass)``: ``multiclass`` maps each subject to
        its raw label, ``binary`` maps it to 1 when the label contains the
        substring ``"suffer"`` and to 0 otherwise.
    """
    gold = pd.read_csv(qrels_file)
    multiclass = dict(zip(gold['Subject'], gold['label']))
    binary = {
        subject: 1 if "suffer" in label else 0
        for subject, label in multiclass.items()
    }
    print(f"\n{len(binary)} lines read in qrels file!\n\n")
    return binary, multiclass
|
| 40 |
+
|
| 41 |
+
# Read Gold labels for Multi-output regression
|
| 42 |
+
def read_qrels_multioutput(qrels_file):
    """Load gold values for the multi-output regression task from a CSV file.

    The file is parsed with ``pandas.read_csv`` and must contain the columns
    ``Subject``, ``suffer_in_favour``, ``suffer_against``, ``suffer_other``
    and ``control``.

    Returns:
        dict mapping each subject to the list
        ``[suffer_in_favour, suffer_against, suffer_other, control]``.
    """
    gold = pd.read_csv(qrels_file)
    value_columns = ('suffer_in_favour', 'suffer_against', 'suffer_other', 'control')
    rows = zip(gold['Subject'], *(gold[column] for column in value_columns))
    qrels = {subject: list(values) for subject, *values in rows}
    print(f"\n{len(qrels)} lines read in qrels file!\n\n")
    return qrels
|
| 49 |
+
|
| 50 |
+
###########################################################################
|
| 51 |
+
# Calculation of Binary classification metrics for Binary classification tasks
|
| 52 |
+
class BinaryClassification():
    """Decision-based and rank-based metrics for a binary classification run.

    Attributes:
        run_results: the predictions passed in as ``data``. The methods use it
            as a pandas DataFrame with ``nick``, ``pred`` and ``round`` columns.
        qrels_b: dict mapping each subject to its gold label, as returned by
            ``read_qrels``.
        task: task identifier; ``"1"`` and ``"2"`` select dedicated latency
            penalty constants in ``penalty``.
    """

    def __init__(self, task, data, qrels):
        """Store the run and load the gold labels from the ``qrels`` CSV file."""
        self.run_results = data
        self.qrels_b = read_qrels(qrels)
        self.task = task

    def penalty(self, delay):
        """Return the latency penalty for a decision taken at round ``delay``.

        The value is ``-1 + 2 / (1 + exp(-p * (delay - 1)))``: 0 for a
        decision at ``delay == 1`` and growing towards 1 with the delay.
        """
        if self.task == "1":  # TCA
            p = 0.0292  # trial
        elif self.task == "2":  # Depression
            p = 0.0179  # trial
        else:
            # Unknown task: `p` used to be left unbound here, which crashed the
            # call. Use the same fallback as BinaryMultiClassification.penalty.
            p = 0.0308
        return -1.0 + 2.0/(1+np.exp(-p*(delay-1)))

    def n_pos(self):
        """Return the sum of the gold labels, i.e. the number of positives."""
        return sum(self.qrels_b.values())

    def _score_decisions(self):
        """Compare every run row with its gold label and collect per-row costs.

        Returns:
            dict with the ERDE cost arrays (``erdes5``, ``erdes30``,
            ``erdes50``, one entry per run row), the ``true_pos`` and
            ``false_pos`` counts and, for true positives only, the
            ``latency_tps`` and ``penalty_tps`` lists.
        """
        total_pos = self.n_pos()
        n_rows = len(self.run_results)
        erdes5 = np.zeros(n_rows)
        erdes30 = np.zeros(n_rows)
        erdes50 = np.zeros(n_rows)
        true_pos = 0
        false_pos = 0
        latency_tps = []
        penalty_tps = []

        for pos, (_, r) in enumerate(self.run_results.iterrows()):
            try:
                gold = self.qrels_b[r['nick']]
            except KeyError:
                # Subjects without a gold label keep a cost of 0.
                print("User does not appear in the qrels:"+r['nick'])
                continue

            if gold == r['pred']:
                if r['pred'] == 1:
                    # True positive: the cost grows with the decision delay.
                    true_pos += 1
                    delay = r["round"]+1
                    erdes5[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 5.0)))
                    erdes30[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 30.0)))
                    erdes50[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 50.0)))
                    latency_tps.append(delay)
                    penalty_tps.append(self.penalty(delay))
                # True negative: cost 0, already the initial array value.
            elif r['pred'] == 1:
                # False positive: cost is the proportion of positive subjects.
                # erdes30 was previously skipped here and silently stayed 0.
                false_pos += 1
                fp_cost = float(total_pos)/float(len(self.qrels_b))
                erdes5[pos] = fp_cost
                erdes30[pos] = fp_cost
                erdes50[pos] = fp_cost
            else:
                # False negative: maximum cost.
                erdes5[pos] = 1
                erdes30[pos] = 1
                erdes50[pos] = 1

        return {
            'erdes5': erdes5, 'erdes30': erdes30, 'erdes50': erdes50,
            'true_pos': true_pos, 'false_pos': false_pos,
            'latency_tps': latency_tps, 'penalty_tps': penalty_tps,
        }

    def eval_performance(self):
        """Print and return the classification and latency-based metrics.

        Sorts ``self.run_results`` by ``nick`` (the sorted frame is stored back
        on the instance) and returns a dict with the keys ``Accuracy``,
        ``Macro_P``, ``Macro_R``, ``Macro_F1``, ``Micro_P``, ``Micro_R``,
        ``Micro_F1``, ``ERDE5``, ``ERDE30``, ``ERDE50``, ``latencyTP``,
        ``speed`` and ``latency-weightedF1``.
        """
        print("===================================================")
        print("DECISION-BASED EVALUATION:")
        self.run_results = self.run_results.sort_values(by=['nick'])
        total_pos = self.n_pos()

        # Latency-based metrics
        scores = self._score_decisions()
        erdes5 = scores['erdes5']
        erdes30 = scores['erdes30']
        erdes50 = scores['erdes50']
        true_pos = scores['true_pos']
        false_pos = scores['false_pos']
        latency_tps = scores['latency_tps']

        if true_pos != 0:
            _speed = 1-np.median(np.array(scores['penalty_tps']))
            median_latency = np.median(np.array(latency_tps))
            precision = float(true_pos) / float(true_pos+false_pos)
            recall = float(true_pos) / float(total_pos)
            f1_erde = 2 * (precision * recall) / (precision + recall)
            _latencyweightedF1 = f1_erde*_speed
        else:
            # Without true positives there is no latency to take a median of;
            # report NaN directly instead of triggering numpy's empty-slice warning.
            _latencyweightedF1 = 0
            _speed = 0
            median_latency = np.nan

        # NOTE(review): the run predictions are handed to sklearn as `y_true`
        # and the gold labels as `y_pred`, so the reported precision and recall
        # are swapped with respect to sklearn's convention. Left unchanged
        # because existing scores depend on it - confirm before changing.
        # The pairing is positional: it assumes the qrels file lists the same
        # subjects in the same order as the nick-sorted run.
        y_true = self.run_results['pred'].tolist()
        y_pred_b = list(self.qrels_b.values())

        # Binary metrics
        accuracy = metrics.accuracy_score(y_true, y_pred_b)
        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')

        print("BINARY METRICS: =============================")
        print("Accuracy:"+str(accuracy))
        print("Macro precision:"+str(macro_precision))
        print("Macro recall:"+str(macro_recall))
        print("Macro f1:"+str(macro_f1))
        print("Micro precision:"+str(micro_precision))
        print("Micro recall:"+str(micro_recall))
        print("Micro f1:"+str(micro_f1))

        print("LATENCY-BASED METRICS: =============================")
        print("ERDE_5:"+str(np.mean(erdes5)))
        print("ERDE_50:"+str(np.mean(erdes50)))
        print("Median latency:"+str(median_latency))
        print("Speed:"+str(_speed))
        print("latency-weightedF1:"+str(_latencyweightedF1))

        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
                'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30': np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': median_latency,
                'speed': _speed, 'latency-weightedF1': _latencyweightedF1}

    # Calculation of P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Print and return P@10, P@20, P@30 and P@50 of the run.

        Rows are ordered by ``pred`` (descending); a row counts as a hit when
        its prediction equals its gold label. As a side effect the run frame
        gains a ``label`` column and is stored back sorted by ``pred``.

        Returns:
            dict keyed by the round numbers 1, 50 and 75, each holding a dict
            with the keys ``@10``, ``@20``, ``@30`` and ``@50``.
        """
        print("===================================================")
        print("RANK-BASED EVALUATION:")
        ranks_at=[1,50,75]
        rank_dit = {}

        # Gold labels are attached through the subject name. The previous
        # positional assignment was repeated after the frame had been re-sorted
        # by `pred`, so every round after the first was scored against
        # misaligned labels. Subjects missing from the qrels get NaN and never
        # count as a hit.
        self.run_results["label"] = self.run_results['nick'].map(self.qrels_b)
        self.run_results = self.run_results.sort_values(by=['pred'],ascending=False)
        hits = [bool(r["pred"] == r['label']) for _, r in self.run_results.head(50).iterrows()]

        # NOTE(review): `rank` does not select any data here, so every round
        # reports the same values - confirm whether per-round runs were intended.
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            p10 = float(sum(hits[:10]))/10.0
            p20 = float(sum(hits[:20]))/20.0
            p30 = float(sum(hits[:30]))/30.0
            p50 = float(sum(hits[:50]))/50.0

            print("PRECISION AT K: =============================")
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
        return rank_dit
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
#############################################################################################
|
| 207 |
+
# Calculation of Regression metrics for Simple regression tasks
|
| 208 |
+
class ClassRegressionEvaluation():
    """Regression and rank-based metrics for a simple regression run.

    Attributes:
        run_results: the predictions passed in as ``data``.
            ``eval_performance`` uses it as a DataFrame with ``nick`` and
            ``pred`` columns; ``eval_performance_rank_based`` indexes it by
            round number and expects such a DataFrame for each round.
        qrels: dict mapping each subject to its gold score, as returned by
            ``read_qrels_regression``.
        task: task identifier (stored, not used by the methods below).
    """

    def __init__(self, task, data, qrels):
        """Store the run and load the gold scores from the ``qrels`` CSV file."""
        self.run_results = data
        self.qrels = read_qrels_regression(qrels)
        self.task = task

    def eval_performance(self):
        """Print and return the RMSE and Pearson correlation of the run.

        The run is sorted by ``nick`` and paired positionally with the gold
        scores, so the qrels file is assumed to list the same subjects in the
        same order.

        Returns:
            dict with the keys ``'RMSE:'`` (the trailing colon is kept for
            existing callers) and ``'Pearson_coefficient'``.

        Raises:
            ValueError: if the run and the gold labels differ in length.
        """
        self.run_results = self.run_results.sort_values(by=['nick'])
        predictions = self.run_results['pred'].tolist()
        gold = list(self.qrels.values())
        if len(predictions) != len(gold):
            raise ValueError(
                "Found input variables with inconsistent numbers of samples: "
                + str([len(predictions), len(gold)])
            )

        # Regression metrics
        # RMSE is computed with numpy: the `squared=False` argument of
        # sklearn's mean_squared_error used before is not accepted by recent
        # scikit-learn releases.
        errors = np.asarray(predictions, dtype=float) - np.asarray(gold, dtype=float)
        _rmse = np.sqrt(np.mean(errors ** 2))
        _pearson, _ = pearsonr(predictions, gold)

        print("REGRESSION METRICS: =============================")
        print("RMSE:"+str(_rmse))
        print("Pearson correlation coefficient:"+str(_pearson))

        return { 'RMSE:': _rmse, 'Pearson_coefficient': _pearson}

    # Calculation of P@5, P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Print and return precision at 5, 10, 20, 30 and 50 for each round.

        For every round in ``[1, 25, 50, 75]`` the frame ``run_results[round]``
        is sorted by ``nick``, paired positionally with the gold scores and
        re-sorted by ``pred`` (descending). A row is a hit when its gold score
        equals its prediction rounded to one decimal. The last frame processed
        stays available as ``self.run_results_``.

        Returns:
            dict keyed by round, each holding a dict with the keys ``@5``,
            ``@10``, ``@20``, ``@30`` and ``@50``.
        """
        print("===================================================")
        print("RANK-BASED EVALUATION:")
        ranks_at=[1,25,50,75]
        rank_dit = {}
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
            # Positional pairing: assumes the qrels follow the nick-sorted order.
            self.run_results_["label"] = list(self.qrels.values())
            self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)

            # Only the 50 best-ranked rows can contribute to any cut-off.
            hits = [
                bool(r["label"] == round(r["pred"],1))
                for _, r in self.run_results_.head(50).iterrows()
            ]
            p5 = float(sum(hits[:5]))/5.0
            p10 = float(sum(hits[:10]))/10.0
            p20 = float(sum(hits[:20]))/20.0
            p30 = float(sum(hits[:30]))/30.0
            p50 = float(sum(hits[:50]))/50.0

            print("PRECISION AT K: =============================")
            print("P@5:"+str(p5))
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@5":p5,"@10":p10,"@20":p20,"@30":p30,"@50":p50}
        return rank_dit
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
############################################################################
|
| 290 |
+
# Calculation of Binary metrics for Multiclass classification tasks
|
| 291 |
+
class BinaryMultiClassification():
    """Decision-based and rank-based metrics for a multiclass run.

    Attributes:
        run_results: the predictions passed in as ``data``. The methods use it
            as a pandas DataFrame with ``nick``, ``pred`` (multiclass label),
            ``pred_b`` (binary decision) and ``round`` columns.
        qrels_b: dict mapping each subject to its binary gold label.
        qrels_multiclass: dict mapping each subject to its raw gold label.
            Both dicts come from ``read_qrels_multiclass``.
        task: task identifier; ``"1"`` and ``"2"`` select dedicated latency
            penalty constants in ``penalty``.
    """

    def __init__(self, task, data, qrels):
        """Store the run and load the gold labels from the ``qrels`` CSV file."""
        self.run_results = data
        self.qrels_b, self.qrels_multiclass = read_qrels_multiclass(qrels)
        self.task = task

    def penalty(self, delay):
        """Return the latency penalty for a decision taken at round ``delay``.

        The value is ``-1 + 2 / (1 + exp(-p * (delay - 1)))``: 0 for a
        decision at ``delay == 1`` and growing towards 1 with the delay.
        """
        # The code used to assign a "test" constant (0.0411 for task 1, 0.0326
        # for task 2) and immediately overwrite it with the "trial" one; only
        # the values actually in effect are kept.
        if self.task == "1":  # TCA
            p = 0.0292  # trial
        elif self.task == "2":  # Depression
            p = 0.0179  # trial
        else:  # Unknown
            p = 0.0308  # test
        return -1.0 + 2.0/(1+np.exp(-p*(delay-1)))

    def n_pos(self):
        """Return the sum of the binary gold labels, i.e. the number of positives."""
        return sum(self.qrels_b.values())

    def _score_decisions(self):
        """Compare every binary decision with its gold label and collect costs.

        Returns:
            dict with the ERDE cost arrays (``erdes5``, ``erdes30``,
            ``erdes50``, one entry per run row), the ``true_pos`` and
            ``false_pos`` counts and, for true positives only, the
            ``latency_tps`` and ``penalty_tps`` lists.
        """
        total_pos = self.n_pos()  # Total number of positive documents
        n_rows = len(self.run_results)
        erdes5 = np.zeros(n_rows)
        erdes30 = np.zeros(n_rows)
        erdes50 = np.zeros(n_rows)
        true_pos = 0
        false_pos = 0
        latency_tps = []
        penalty_tps = []

        for pos, (_, r) in enumerate(self.run_results.iterrows()):
            try:
                gold = self.qrels_b[r['nick']]
            except KeyError:
                # Subjects without a gold label keep a cost of 0.
                print("User does not appear in the qrels:"+r['nick'])
                continue

            if gold == r['pred_b']:
                if r['pred_b'] == 1:
                    # True positive: the cost grows with the decision delay.
                    true_pos += 1
                    delay = r["round"]+1
                    erdes5[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 5.0)))
                    erdes30[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 30.0)))
                    erdes50[pos] = 1.0 - (1.0/(1.0+np.exp(delay - 50.0)))
                    latency_tps.append(delay)
                    penalty_tps.append(self.penalty(delay))
                # True negative: cost 0, already the initial array value.
            elif r['pred_b'] == 1:
                # False positive: cost is the proportion of positive subjects.
                false_pos += 1
                fp_cost = float(total_pos)/float(len(self.qrels_b))
                erdes5[pos] = fp_cost
                erdes30[pos] = fp_cost
                erdes50[pos] = fp_cost
            else:
                # False negative: maximum cost.
                erdes5[pos] = 1
                erdes30[pos] = 1
                erdes50[pos] = 1

        return {
            'erdes5': erdes5, 'erdes30': erdes30, 'erdes50': erdes50,
            'true_pos': true_pos, 'false_pos': false_pos,
            'latency_tps': latency_tps, 'penalty_tps': penalty_tps,
        }

    def eval_performance(self):
        """Print and return the classification and latency-based metrics.

        Latency metrics use the binary decisions (``pred_b``); the sklearn
        metrics compare the multiclass ``pred`` column with the raw gold
        labels. Sorts ``self.run_results`` by ``nick`` and stores the sorted
        frame back on the instance.

        Returns:
            dict with the keys ``Accuracy``, ``Macro_P``, ``Macro_R``,
            ``Macro_F1``, ``Micro_P``, ``Micro_R``, ``Micro_F1``, ``ERDE5``,
            ``ERDE30``, ``ERDE50``, ``latencyTP``, ``speed`` and
            ``latency-weightedF1``.
        """
        print("===================================================")
        print("DECISION-BASED EVALUATION:")
        self.run_results = self.run_results.sort_values(by=['nick'])
        total_pos = self.n_pos()

        scores = self._score_decisions()
        erdes5 = scores['erdes5']
        erdes30 = scores['erdes30']
        erdes50 = scores['erdes50']
        true_pos = scores['true_pos']
        false_pos = scores['false_pos']
        latency_tps = scores['latency_tps']

        if true_pos != 0:
            _speed = 1-np.median(np.array(scores['penalty_tps']))
            median_latency = np.median(np.array(latency_tps))
            precision = float(true_pos) / float(true_pos+false_pos)
            recall = float(true_pos) / float(total_pos)
            f1_erde = 2 * (precision * recall) / (precision + recall)
            _latencyweightedF1 = f1_erde*_speed
        else:
            # Without true positives there is no latency to take a median of;
            # report NaN directly instead of triggering numpy's empty-slice warning.
            _latencyweightedF1 = 0
            _speed = 0
            median_latency = np.nan

        # NOTE(review): the run predictions are handed to sklearn as `y_true`
        # and the gold labels as `y_pred`, so the reported precision and recall
        # are swapped with respect to sklearn's convention. Left unchanged
        # because existing scores depend on it - confirm before changing.
        # The pairing is positional: it assumes the qrels file lists the same
        # subjects in the same order as the nick-sorted run.
        y_true = self.run_results['pred'].tolist()
        y_pred_b = list(self.qrels_multiclass.values())

        # Binary metrics
        accuracy = metrics.accuracy_score(y_true, y_pred_b)
        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')

        print("BINARY METRICS: =============================")
        print("Accuracy:"+str(accuracy))
        print("Macro precision:"+str(macro_precision))
        print("Macro recall:"+str(macro_recall))
        print("Macro f1:"+str(macro_f1))
        print("Micro precision:"+str(micro_precision))
        print("Micro recall:"+str(micro_recall))
        print("Micro f1:"+str(micro_f1))

        print("LATENCY-BASED METRICS: =============================")
        print("ERDE_5:"+str(np.mean(erdes5)))
        print("ERDE_50:"+str(np.mean(erdes50)))
        print("Median latency:"+str(median_latency))
        print("Speed:"+str(_speed))
        print("latency-weightedF1:"+str(_latencyweightedF1))

        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
                'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30':np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': median_latency,
                'speed': _speed, 'latency-weightedF1': _latencyweightedF1}

    # Calculation of P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Print and return P@10, P@20, P@30 and P@50 of the binary decisions.

        Rows are ordered by ``pred_b`` (descending); a row counts as a hit when
        its binary decision equals its binary gold label. As a side effect the
        run frame gains a ``label`` column and is stored back sorted by
        ``pred_b``.

        Returns:
            dict keyed by the round numbers 1, 50 and 75, each holding a dict
            with the keys ``@10``, ``@20``, ``@30`` and ``@50``.
        """
        print("===================================================")
        print("PRECISION AT K - EVALUATION:")
        ranks_at=[1,50,75]
        rank_dit = {}

        # Gold labels are attached through the subject name. The previous
        # positional assignment was repeated after the frame had been re-sorted
        # by `pred_b`, so every round after the first was scored against
        # misaligned labels. Subjects missing from the qrels get NaN and never
        # count as a hit.
        self.run_results["label"] = self.run_results['nick'].map(self.qrels_b)
        self.run_results = self.run_results.sort_values(by=['pred_b'],ascending=False)
        hits = [bool(r["pred_b"] == r['label']) for _, r in self.run_results.head(50).iterrows()]

        # NOTE(review): `rank` does not select any data here, so every round
        # reports the same values - confirm whether per-round runs were intended.
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            p10 = float(sum(hits[:10]))/10.0
            p20 = float(sum(hits[:20]))/20.0
            p30 = float(sum(hits[:30]))/30.0
            p50 = float(sum(hits[:50]))/50.0

            print("PRECISION AT K: =============================")
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
        return rank_dit
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
#######################################################################################
|
| 451 |
+
# Calculation of Regression metrics for Multi-output regression tasks
|
| 452 |
+
class ClassMultiRegressionEvaluation():
    """Regression and rank-based metrics for a multi-output regression run.

    Attributes:
        run_results: the predictions passed in as ``data``.
            ``eval_performance`` uses it as a DataFrame with ``nick`` and
            ``pred`` columns, where each ``pred`` holds four values;
            ``eval_performance_rank_based`` indexes it by round number and
            expects such a DataFrame for each round.
        qrels: dict mapping each subject to its four gold values, as returned
            by ``read_qrels_multioutput``.
        task: task identifier (stored, not used by the methods below).
    """

    def __init__(self, task, data, qrels):
        """Store the run and load the gold values from the ``qrels`` CSV file."""
        self.run_results = data
        self.qrels = read_qrels_multioutput(qrels)
        self.task = task

    def eval_performance(self):
        """Print and return the RMSE and per-output Pearson correlations.

        The run is sorted by ``nick`` and paired positionally with the gold
        values, so the qrels file is assumed to list the same subjects in the
        same order.

        Returns:
            dict with the keys ``'RMSE:'`` (the trailing colon is kept for
            existing callers), ``Pearson_mean``, ``Pearson_sf``,
            ``Pearson_sa``, ``Pearson_so`` and ``Pearson_c``.

        Raises:
            ValueError: if predictions and gold values differ in shape.
        """
        self.run_results = self.run_results.sort_values(by=['nick'])
        predictions = np.asarray(self.run_results['pred'].tolist(), dtype=float)
        gold = np.asarray(list(self.qrels.values()), dtype=float)
        if predictions.shape != gold.shape:
            raise ValueError(
                "Predictions and gold labels have inconsistent shapes: "
                + str([predictions.shape, gold.shape])
            )

        # Regression metrics
        # Per-output RMSE is computed with numpy: the `squared=False` argument
        # of sklearn's mean_squared_error used before is not accepted by recent
        # scikit-learn releases.
        per_output_rmse = np.sqrt(np.mean((predictions - gold) ** 2, axis=0))
        # NOTE(review): as before, only the RMSE of the first output is
        # reported - confirm whether an average over the four was intended.
        _rmse = per_output_rmse[0]
        _pearson_sf, _ = pearsonr(predictions[:, 0], gold[:, 0])
        _pearson_sa, _ = pearsonr(predictions[:, 1], gold[:, 1])
        _pearson_so, _ = pearsonr(predictions[:, 2], gold[:, 2])
        _pearson_c, _ = pearsonr(predictions[:, 3], gold[:, 3])

        print("REGRESSION METRICS: =============================")
        print("RMSE:"+str(_rmse))
        print("Pearson correlation coefficient:")
        print("Pearson sf:"+str(_pearson_sf))
        print("Pearson sa:"+str(_pearson_sa))
        print("Pearson so:"+str(_pearson_so))
        print("Pearson c:"+str(_pearson_c))
        pearson = (_pearson_sf + _pearson_sa + _pearson_so + _pearson_c)/4
        return { 'RMSE:': _rmse, 'Pearson_mean': pearson,'Pearson_sf': _pearson_sf, 'Pearson_sa': _pearson_sa,'Pearson_so': _pearson_so,'Pearson_c': _pearson_c}

    # Calculation of P@5, P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Print and return precision at 5, 10, 20, 30 and 50 for each round.

        For every round in ``[1, 25, 50, 75]`` the frame ``run_results[round]``
        is sorted by ``nick``, paired positionally with the gold values and
        re-sorted by ``pred`` (descending). For each of the four outputs a row
        is a hit when the gold value equals the prediction rounded to one
        decimal; the reported precisions are averaged over the four outputs.
        The last frame processed stays available as ``self.run_results_``.

        Returns:
            dict keyed by round, each holding a dict with the keys ``@5``,
            ``@10``, ``@20``, ``@30`` and ``@50``.
        """
        print("===================================================")
        print("PRECISION AT - EVALUATION:")
        ranks_at=[1,25,50,75]
        rank_dit = {}
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
            # Positional pairing: assumes the qrels follow the nick-sorted order.
            self.run_results_["label"] = list(self.qrels.values())
            self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)

            # Only the 50 best-ranked rows can contribute to any cut-off.
            top_rows = [r for _, r in self.run_results_.head(50).iterrows()]
            p5 = 0
            p10 = 0
            p20 = 0
            p30 = 0
            p50 = 0
            for j in range(0,4):
                hits = [bool(r['label'][j] == round(r["pred"][j],1)) for r in top_rows]
                # P@5 previously divided the top-10 hit count by 5 and could
                # exceed 1.0; each cut-off now counts only its own rows.
                p5 += float(sum(hits[:5]))/5.0
                p10 += float(sum(hits[:10]))/10.0
                p20 += float(sum(hits[:20]))/20.0
                p30 += float(sum(hits[:30]))/30.0
                p50 += float(sum(hits[:50]))/50.0

            print("PRECISION AT K: =============================")
            print("P@5:"+str(p5/4))
            print("P@10:"+str(p10/4))
            print("P@20:"+str(p20/4))
            print("P@30:"+str(p30/4))
            print("P@50:"+str(p50/4))
            rank_dit[rank] = {"@5":p5/4,"@10":p10/4,"@20":p20/4,"@30":p30/4,"@50":p50/4}
        return rank_dit
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
# Class for calculating carbon emission values
|
| 546 |
+
class Emissions():
    """Turn cumulative emission readings into per-round values and summary statistics.

    ``emissions_run`` maps metric names to lists and is mutated in place: each
    call to ``update_emissions`` appends one value to every numeric metric.
    The hardware descriptors (``cpu_count``, ``gpu_count``, ``cpu_model``,
    ``gpu_model``, ``ram_total_size``) are never appended to.
    """

    def __init__(self, emissions_run) -> None:
        self.emissions_run = emissions_run
        # Previous cumulative reading per metric; every metric starts at zero.
        self.aux = {metric: 0 for metric in emissions_run}

    # Update of values after a prediction has been made
    def update_emissions(self, emissions_round):
        """Append this round's increment for every numeric metric.

        The readings in ``emissions_round`` are accumulated totals, so the
        value for one round is the difference with the previous reading.
        """
        descriptors = ("cpu_count", "gpu_count", "cpu_model", "gpu_model", "ram_total_size")
        for metric in self.emissions_run:
            if metric in descriptors:
                continue
            increment = emissions_round[metric] - self.aux[metric]
            self.emissions_run[metric].append(increment)
            self.aux[metric] = emissions_round[metric]

    # Calculation of final values after all predictions have been made
    def calculate_emissions(self):
        """Return a flat dict with min/max/mean/var per numeric metric.

        Hardware descriptors are copied from the first element of their list.
        """
        descriptors = ("cpu_count", "gpu_count", "cpu_model", "gpu_model", "ram_total_size")
        summary = {}
        for metric in self.emissions_run:
            values = self.emissions_run[metric]
            if metric in descriptors:
                # Non-numerical values
                summary[metric] = values[0]
                continue
            # Numerical values
            summary[metric + "_min"] = min(values)
            summary[metric + "_max"] = max(values)
            summary[metric + "_mean"] = sum(values) / len(values)
            summary[metric + "_var"] = np.var(values)
        return summary
|
src/data.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests, os, glob
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
|
| 5 |
+
# Locations of the local dataset copies and of the gold-label sub-directory.
train_dir = "./data/train"
test_dir = "./data/test"
truth_dir = "golden_truth"


def load(set_name:str='train', with_labels:bool=True) -> pd.DataFrame:
    """
    Load one split of the dataset as a dataframe with one row per message.

    When the split directory exists locally, every ``*.json`` file in it is
    read; with ``with_labels=True`` the task-d gold file
    ``<split>/golden_truth/task2_gold_d.txt`` is merged in (columns prefixed
    with ``d_``) and the a/b/c task labels derived from it are appended.
    When the directory does not exist the split is fetched through
    ``get_train`` / ``get_test`` instead.

    Parameters
    ----------
    set_name : str
        Either ``'train'`` or ``'test'``.
    with_labels : bool
        Whether to attach the gold labels.

    Raises
    ------
    ValueError
        If ``set_name`` is neither ``'train'`` nor ``'test'``.
    """
    if set_name == 'train':
        path = train_dir
    elif set_name == 'test':
        path = test_dir
    else:
        raise ValueError("set_name must be either 'train' or 'test'")

    if not os.path.exists(path):
        # NOTE(review): the fetched dataset is returned as provided, without
        # deriving the a/b/c labels here - confirm it already carries them.
        fetch = get_train if set_name == "train" else get_test
        return fetch(with_labels=with_labels)

    data_files = glob.glob(os.path.join(path, '*.json'))
    truth_df = None
    if with_labels:
        truth_path = os.path.join(path, truth_dir, 'task2_gold_d.txt')
        truth_df = pd.read_csv(truth_path).rename(
            columns=lambda s: 'd_' + s if s != 'Subject' else 'subject_id'
        )
    df = load_from_files(data_files, truth=truth_df)
    if with_labels:
        # Without the gold file there are no ``d_`` columns to derive labels
        # from, so this step only runs when labels were requested.
        d_values = df.filter(regex='^d_').values.astype(float)
        df = pd.concat([df, make_task_labels_from_d(d_values)], axis=1)
    return df
|
| 37 |
+
|
| 38 |
+
def concat_messages(df:pd.DataFrame, sep:str=' | ') -> pd.DataFrame:
    """
    Collapse the dataframe to one row per subject.

    Messages are ordered by date within each subject and joined with ``sep``;
    ``round`` keeps the value of the latest message and every other column
    keeps the value of the earliest one.
    """
    # Columns that are neither the grouping key nor handled explicitly below.
    passthrough = df.columns.drop(['subject_id', 'message', 'round'])

    aggregations = {
        'message': lambda msgs: sep.join(msgs),
        'round': 'last',
    }
    for column in passthrough:
        aggregations[column] = 'first'

    with_dates = df.assign(date=lambda frame: pd.to_datetime(frame['date']))
    chronological = with_dates.sort_values(['subject_id', 'date'], ascending=[True, True])
    per_subject = chronological.groupby('subject_id').agg(aggregations)
    return per_subject.sort_index().reset_index()
|
| 55 |
+
|
| 56 |
+
def load_from_files(files, truth=None):
    """
    Load all the messages stored in the given JSON files into a dataframe.

    Each file holds a list of message dicts. ``id_message``, ``date`` and
    ``message`` are required keys; ``nick`` falls back to the file name
    without its extension and ``round`` falls back to ``-1``.

    Parameters
    ----------
    files : iterable of str
        Paths of the JSON files to read.
    truth : pd.DataFrame, optional
        Label table with a ``subject_id`` column; when given it is merged
        into the result on ``subject_id``.
    """
    import os
    import json

    data = []
    for f in files:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        with open(f, encoding='utf-8') as file:
            msgs = json.load(file)
        fallback_nick = os.path.basename(f).split('.')[0]
        for msg in msgs:
            data.append([
                msg.get('nick', fallback_nick),
                msg.get('round', -1),
                msg['id_message'],
                msg['date'],
                msg['message']])
    df = pd.DataFrame(data, columns=['subject_id', 'round', 'id_message', 'date', 'message'])
    if truth is not None:
        df = df.merge(truth, on='subject_id')
    return df
|
| 74 |
+
|
| 75 |
+
def get_train(hf_token:str=None, with_labels:bool=True):
    """
    Download the training split of ``nlpUc3mStudents/mental-risk-d`` as a dataframe.

    Parameters
    ----------
    hf_token : str, optional
        Currently not forwarded to the download call; kept so existing
        callers that pass a token keep working.
    with_labels : bool, optional
        Accepted so that ``load`` can call this function with the same keyword
        it uses for local files. The downloaded table is returned unchanged
        whatever its value.
    """
    from datasets import load_dataset
    ds = load_dataset('nlpUc3mStudents/mental-risk-d')
    return ds['train'].to_pandas()
|
| 80 |
+
|
| 81 |
+
def get_test(hf_token:str=None, with_labels:bool=True):
    """
    Placeholder for fetching the test split, which is not available.

    Both parameters are optional so that ``load`` (which passes only
    ``with_labels``) reaches the explicit error below instead of failing with
    a ``TypeError`` on the call itself.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Test data is not available")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Column order of the task-d targets; the last column is the control class.
task_d_cols = ['suffer+in favour', 'suffer+against', 'suffer+other', 'control']


def make_task_labels_from_d(d_data:np.ndarray, include_d:bool=False) -> pd.DataFrame:
    """
    Get the labels of all other tasks from the labels of the d task

    Parameters
    ----------
    d_data : np.ndarray or pd.DataFrame
        One row per subject. An array is given the ``task_d_cols`` column
        names; a dataframe keeps its own names. In both cases the last column
        is treated as the control class and all the others as risk classes.
    include_d : bool
        When True the d columns are returned together with the new labels.

    Returns
    -------
    pd.DataFrame
        ``c_label`` is the name of the largest risk column when the risk
        columns sum to at least 0.5 and ``'control'`` otherwise; ``a_label``
        is 1 when ``c_label`` is not ``'control'``; ``b_label`` is the sum of
        the risk columns rounded to two decimals.
    """
    if isinstance(d_data, pd.DataFrame):
        d_df = d_data.astype(float)
    else:
        d_df = pd.DataFrame(d_data, columns=task_d_cols).astype(float)

    # Select the risk columns by position so a dataframe with its own column
    # names (e.g. ``d_``-prefixed ones) works the same way as a raw array.
    risk = d_df.iloc[:, :-1]
    risk_cols = list(risk.columns)
    c_label = pd.Series(
        [
            risk_cols[int(np.argmax(row))] if sum(row) >= 0.5 else 'control'
            for row in risk.to_numpy()
        ],
        index=d_df.index,
        dtype=object,
    )
    df = d_df.assign(
        c_label=c_label,
        a_label=(c_label != 'control').astype(int),
        b_label=risk.sum(axis=1).round(2),
    )
    if not include_d:
        df = df[['a_label', 'b_label', 'c_label']]
    return df
|
src/embeddings.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Dict, Any, Union
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.base import BaseEstimator, RegressorMixin
|
| 4 |
+
from sklearn.multioutput import MultiOutputRegressor
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
from transformers import AutoTokenizer
|
| 7 |
+
import sklearn
|
| 8 |
+
from sklearn.pipeline import Pipeline
|
| 9 |
+
from sklearn.decomposition import PCA
|
| 10 |
+
from sklearn.preprocessing import StandardScaler
|
| 11 |
+
|
| 12 |
+
from copy import deepcopy
|
| 13 |
+
|
| 14 |
+
from . import utils
|
| 15 |
+
|
| 16 |
+
class EmbeddingsRegressor(BaseEstimator, RegressorMixin):
    """Regressor that encodes raw texts and passes the encodings to a wrapped regressor.

    The encoder must expose ``encode(X, show_progress_bar=...)`` and the
    regressor the usual ``fit`` / ``predict`` pair. The most recent encodings
    computed by ``transform`` or ``predict`` are kept in ``self.encodings``.
    """

    def __init__(
        self,
        encoder: Union[SentenceTransformer, AutoTokenizer],
        regressor: Union[MultiOutputRegressor, BaseEstimator],
        normalize_output: bool = True,
        verbose: bool = False
    ):
        self.encoder = encoder
        self.regressor = regressor
        self.normalize_output = normalize_output
        self.verbose = verbose
        self.encodings = None

    def fit(self, X: List[str], y: List[Tuple[float, float, float, float]]) -> "EmbeddingsRegressor":
        """Encode the texts and fit the wrapped regressor on the encodings."""
        features = self.encoder.encode(X, show_progress_bar=self.verbose)
        self.regressor.fit(features, y)
        return self

    def transform(self, X: List[str]) -> List[List[float]]:
        """Return the encodings of the texts and remember them."""
        self.encodings = self.encoder.encode(X, show_progress_bar=self.verbose)
        return self.encodings

    def predict(self, X: Union[List[str], np.array], encodings=False) -> Union[List[float],List[List[float]]]:
        """Predict from texts, or from precomputed encodings when ``encodings`` is true.

        With ``normalize_output`` each prediction row is divided in place by
        its sum.
        """
        features = X
        if not encodings:
            features = self.encoder.encode(X, show_progress_bar=self.verbose)
            self.encodings = features
        pred = self.regressor.predict(features)
        if not self.normalize_output:
            return pred
        pred /= pred.sum(axis=1, keepdims=True)
        return pred
|
| 49 |
+
|
src/eval.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, List, Tuple, Any, Callable
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.metrics import (
|
| 6 |
+
f1_score, accuracy_score, recall_score, confusion_matrix,
|
| 7 |
+
classification_report,
|
| 8 |
+
r2_score, mean_squared_error
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class ClassificationScores:
    """Precision / recall / F1 (and optional support) for one label or one average."""
    precision: float
    recall: float
    f1: float
    support: float = None

    @classmethod
    def from_dict(cls, d:Dict[str, float]) -> "ClassificationScores":
        """Build the scores from a mapping whose keys may carry a ``-suffix``.

        Only the part of each key before the first ``-`` is looked at (so
        ``'f1-score'`` feeds ``f1``); keys that do not name a field are dropped.
        """
        known_fields = cls.__annotations__
        kwargs = {}
        for key, value in d.items():
            field_name = key.split('-')[0]
            if field_name in known_fields:
                kwargs[field_name] = value
        return cls(**kwargs)
|
| 23 |
+
|
| 24 |
+
@dataclass
class RegressionScores:
    """R2, MSE and RMSE of one regression target; supports ``+`` and ``/`` for averaging."""
    r2: float
    mse: float
    rmse: float

    @classmethod
    def make(cls, true:np.ndarray, pred:np.ndarray) -> "RegressionScores":
        """Compute the scores for the given true and predicted values."""
        # ``mean_squared_error(..., squared=False)`` is no longer available in
        # recent scikit-learn. Averaging the per-output square roots gives the
        # same value that call used to return.
        per_output_mse = mean_squared_error(true, pred, multioutput='raw_values')
        return cls(
            r2=r2_score(true, pred),
            mse=mean_squared_error(true, pred),
            rmse=np.mean(np.sqrt(per_output_mse))
        )

    def __add__(self, other):
        """Field-wise sum with another ``RegressionScores``."""
        return RegressionScores(
            r2=self.r2 + other.r2,
            mse=self.mse + other.mse,
            rmse=self.rmse + other.rmse
        )

    def __truediv__(self, other):
        """Divide every field by the scalar ``other``."""
        return RegressionScores(
            r2=self.r2 / other,
            mse=self.mse / other,
            rmse=self.rmse / other
        )
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass
class ClassificationReport:
    """Accuracy, confusion matrix and per-label / averaged scores of a classifier."""
    accuracy: float
    confusion: np.ndarray
    macro: ClassificationScores
    weighted: ClassificationScores
    labels: list
    label_scores: Dict[str, ClassificationScores] # label -> ClassificationScores

    f1: float = None # only for binary classification
    recall: float = None # only for binary classification

    @classmethod
    def make_report(cls, true:np.ndarray, pred:np.ndarray) -> "ClassificationReport":
        """Build the report from true and predicted labels.

        The label set is the sorted union of the values seen in ``true`` and
        ``pred``. When exactly two labels are present the binary ``f1`` and
        ``recall`` are filled in as well.
        """
        class_labels = np.unique(np.concatenate([true, pred]))
        report = classification_report(true, pred, labels=class_labels, output_dict=True, zero_division=0)
        # Pop the aggregate entries first so only per-label entries remain.
        accuracy = report.pop('accuracy')
        macro = ClassificationScores.from_dict(report.pop('macro avg'))
        weighted = ClassificationScores.from_dict(report.pop('weighted avg'))
        rep = cls(
            accuracy=accuracy,
            confusion=confusion_matrix(true, pred, labels=class_labels),
            macro=macro,
            weighted=weighted,
            label_scores={label: ClassificationScores.from_dict(scores) for label, scores in report.items()},
            labels=list(class_labels)
        )
        if len(class_labels) == 2:
            # The scikit-learn default positive label is 1, which raises when
            # the two classes are e.g. strings. Keep 1 whenever it is present
            # and otherwise use the greater of the two labels.
            values = class_labels.tolist()
            positive = 1 if any(value == 1 for value in values) else values[1]
            rep.f1 = f1_score(true, pred, pos_label=positive)
            rep.recall = recall_score(true, pred, pos_label=positive)
        return rep

    @property
    def df(self):
        """One-row dataframe with the accuracy and the macro-averaged scores."""
        row = {'Accuracy': self.accuracy}
        for score in self.macro.__annotations__:
            if score != 'support':
                row[f'{score.title()} (macro)'] = getattr(self.macro, score)
        return pd.DataFrame([row])
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@dataclass
class RegressionReport:
    """Overall R2 / RMSE of a regressor plus optional per-target scores."""
    r2: float
    rmse: float
    labels: list = None # only for multivariate regression
    label_scores: Dict[str, float] = None # only for multivariate regression

    @classmethod
    def make_report(cls, true:np.ndarray, pred:np.ndarray, labels=None) -> "RegressionReport":
        """Build the report from true and predicted values.

        When ``true`` has more than one column, per-target scores are added
        under ``labels`` (or under the column positions when no labels are
        given).
        """
        # ``mean_squared_error(..., squared=False)`` is no longer available in
        # recent scikit-learn. Averaging the per-output square roots gives the
        # same value that call used to return.
        per_output_mse = mean_squared_error(true, pred, multioutput='raw_values')
        report = cls(
            r2=r2_score(true, pred),
            rmse=np.mean(np.sqrt(per_output_mse))
        )
        if len(true.shape) > 1 and true.shape[1] > 1:
            report.labels = labels or list(range(true.shape[1]))
            report.label_scores = {label: RegressionScores.make(true[:,i], pred[:,i]) for i,label in enumerate(report.labels)}
        return report

    @property
    def is_multivariate(self):
        """True when per-target labels were recorded."""
        return self.labels is not None

    @property
    def df(self):
        """One-row dataframe of the scores, with 'RMSE avg' as the first column.

        A leading ``a_``/``b_``/``c_``/``d_`` task prefix is stripped from the
        label part of the per-target column names.
        """
        df_dict = {
            'R2 avg': self.r2,
            'RMSE avg': self.rmse,
        }
        if self.is_multivariate:
            df_dict.update({f'R2 {label}': scores.r2 for label, scores in self.label_scores.items()})
            df_dict.update({f'RMSE {label}': scores.rmse for label, scores in self.label_scores.items()})
        df = pd.DataFrame([df_dict])
        # 'Pearson avg' is never produced here; ``filter`` ignores missing items.
        leading = ['RMSE avg', 'Pearson avg']
        df = df.filter(items=leading + sorted(df.columns.difference(leading)))
        df.columns = df.columns.str.replace(r'\s(a|b|c|d)_', ' ', regex=True)
        return df
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
class Results:
    """Reports of the four tasks; a task without a report is ``None``."""
    # Every field defaults to None so a result can be built for a subset of
    # the tasks.
    taska: "ClassificationReport" = None
    taskb: "RegressionReport" = None
    taskc: "ClassificationReport" = None
    taskd: "RegressionReport" = None
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def absolute_results(true_df:pd.DataFrame, pred_df:pd.DataFrame, tasks='abcd'):
    """
    Build the evaluation reports of the requested tasks.

    The columns of a task are those whose name starts with ``<task>_`` in both
    dataframes. Tasks ``a`` and ``c`` get a classification report computed on
    their first matching column; tasks ``b`` and ``d`` get a regression report
    on all matching columns. A task that was not requested, or that has no
    matching column in either dataframe, is reported as ``None``.
    """
    # Start from a complete set of entries so ``Results`` can be built even
    # when only some of the tasks are requested.
    task_reports = {'task' + task: None for task in 'abcd'}
    for task in tasks:
        true = true_df.filter(regex=f'^{task}_').sort_index(axis=1)
        pred = pred_df.filter(regex=f'^{task}_').sort_index(axis=1)
        if len(true.columns) == 0 or len(pred.columns) == 0:
            task_reports['task' + task] = None
            continue
        if task in ['a', 'c']:
            task_reports['task' + task] = ClassificationReport.make_report(
                true=true.iloc[:, 0].values,
                pred=pred.iloc[:, 0].values
            )
        else:
            task_reports['task' + task] = RegressionReport.make_report(
                true=true.values,
                pred=pred.values,
                labels=true.columns.tolist() if task == 'd' else None
            )
    return Results(**task_reports)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
    """
    Return a function that fits and scores every named estimator.

    The returned function takes ``(X_train, y_train, X_test, y_test)``, fits
    each estimator on the training data, scores its test predictions with
    ``score_func(y_test, y_pred)``, prints the score and returns a dict
    mapping estimator name to score.
    """
    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
        results = {}
        for label, model in estimators:
            model.fit(X_train, y_train)
            value = score_func(y_test, model.predict(X_test))
            print(f"\"{label}\" estimator score: {value:.4f}")
            results[label] = value
        return results
    return fit_eval_estimators
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def label_metrics(score_fun, y_true, y_pred):
    """
    Apply ``score_fun`` and always return the result as a list.

    With more than one target column the function is applied column by column
    and the list holds one score per column. Otherwise it is applied once to
    the flattened arrays; a list result is returned as is, an array result is
    converted with ``tolist()`` and any other result is wrapped in a list.
    """
    shape = y_true.shape
    if len(shape) > 1 and shape[1] > 1:
        return [score_fun(y_true[:, col], y_pred[:, col]) for col in range(shape[1])]

    result = score_fun(y_true.ravel(), y_pred.ravel())
    if isinstance(result, list):
        return result
    if isinstance(result, np.ndarray):
        return result.tolist()
    return [result]
|
| 189 |
+
|
| 190 |
+
def metrics_for_estimators(estimators, score_fun, X, y_true):
    """
    Score the predictions of every named estimator on ``X``.

    Returns a dict mapping each estimator name to the list produced by
    ``label_metrics(score_fun, y_true, estimator.predict(X))``.
    """
    return {
        label: label_metrics(score_fun, y_true, model.predict(X))
        for label, model in estimators
    }
|
src/models.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .roberta_regressor import RobertaRegressor
|
| 2 |
+
from .embeddings import EmbeddingsRegressor
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class EmbeddingsSimpleRegressor:
    """Unimplemented stub.

    NOTE(review): no behaviour has been defined for this class yet; the body
    only exists so that the module can be imported.
    """
|
src/multiregression.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple, Dict, Any, Union
|
| 2 |
+
from copy import deepcopy
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import sklearn
|
| 6 |
+
from sklearn.base import BaseEstimator, RegressorMixin
|
| 7 |
+
from sklearn.multioutput import MultiOutputRegressor
|
| 8 |
+
from sklearn.pipeline import Pipeline
|
| 9 |
+
from sklearn.decomposition import PCA
|
| 10 |
+
from sklearn.preprocessing import StandardScaler
|
| 11 |
+
|
| 12 |
+
from . import utils
|
| 13 |
+
|
| 14 |
+
class RegChainWithPCA(BaseEstimator, RegressorMixin):
    """Regressor chain whose inputs are standardised and reduced with PCA."""

    def __init__(
        self,
        base_regressor:sklearn.base.BaseEstimator,
        num_components:float=0.97,
        pca_exclude_first:bool=True,
        **fit_params):
        """
        This chain works like sklearn.multioutput.RegressorChain,
        but applies PCA to reduce the dimensionality of the input data of the chain.
        By default, the first target is excluded from the PCA transformation.
        That is, it is fitted with the original input data while the rest of the targets
        are fitted with the PCA-transformed input data.

        Parameters
        ----------

        base_regressor : sklearn.base.BaseEstimator
            The base regressor to be used in the chain.
        num_components : float, optional
            The number of components to keep in the PCA transformation.
            If float, it is the ratio of variance to be kept.
            If int, it is the number of components to keep.
            The default is 0.97.
        pca_exclude_first : bool, optional
            If True the first target is excluded from the PCA transformation.
            If False all targets including the first are fitted with the PCA-transformed input data.
            The default is True.
        **fit_params :
            Additional parameters to be passed to the fit method of the base regressor.
        """
        self.base_regressor = base_regressor
        self.num_components = num_components
        self.pca_exclude_first = pca_exclude_first
        self.estimators = None
        # Fitted scaler + PCA pipeline, set by ``fit_pipe``.
        self.pipe = None
        # Unused attribute kept so existing code reading it does not break.
        self.pipes = None
        self.fit_params = fit_params

    def fit_pipe(self, X, num_components=None):
        """Fit the scaler + PCA pipeline on ``X``, store it and return it.

        ``num_components`` overrides the value given to the constructor for
        this call only.
        """
        if num_components is None:
            num_components = self.num_components
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=num_components)),
        ])
        pipe.fit(X)
        self.pipe = pipe
        return pipe

    def fit(self, X, y, **fit_params):
        """Fit one copy of the base regressor per target column of ``y``.

        Each estimator after the first sees the PCA components followed by
        the training predictions of the estimators before it. Returns
        ``self``.
        """
        fit_params_ = self.fit_params.copy()
        fit_params_.update(fit_params)
        # Accept any 2-D array-like for the targets (column slicing below).
        y = np.asarray(y)
        pipe = self.fit_pipe(X)
        X_transformed = pipe.transform(X)
        num_targets = y.shape[1]
        num_components_pca = X_transformed.shape[1]
        # PCA components followed by one (initially zero) column per target.
        X_aug = np.hstack((X_transformed, np.zeros((X_transformed.shape[0], num_targets))))
        del X_transformed
        self.estimators = [deepcopy(self.base_regressor) for _ in range(num_targets)]
        for idx, estimator in enumerate(self.estimators):
            if idx == 0 and self.pca_exclude_first:
                features = X
            else:
                features = X_aug[:, : (num_components_pca + idx)]
            estimator.fit(features, y[:, idx], **fit_params_)
            if idx < num_targets - 1:
                # Feed this estimator's predictions to the next links.
                X_aug[:, num_components_pca + idx] = estimator.predict(features)
        return self

    def predict(self, X):
        """Return an array with one prediction column per fitted estimator."""
        X_transformed = self.pipe.transform(X)
        num_components_pca = X_transformed.shape[1]
        Y_pred_chain = np.zeros((X_transformed.shape[0], len(self.estimators)))
        X_aug = np.hstack((X_transformed, Y_pred_chain))
        for idx, estimator in enumerate(self.estimators):
            if idx == 0 and self.pca_exclude_first:
                Y_pred_chain[:, idx] = estimator.predict(X)
            else:
                Y_pred_chain[:, idx] = estimator.predict(X_aug[:, : (num_components_pca + idx)])
            if idx < len(self.estimators) - 1:
                X_aug[:, num_components_pca + idx] = Y_pred_chain[:, idx]
        return Y_pred_chain

    def score(self, X, y):
        """Score the predictions for ``X`` against ``y`` with ``utils.comp_score``."""
        return utils.comp_score(y, self.predict(X))

    def get_params(self, deep=True):
        """Return the constructor arguments, including the extra fit parameters."""
        return {
            'base_regressor': self.base_regressor,
            'num_components': self.num_components,
            'pca_exclude_first': self.pca_exclude_first,
            **self.fit_params
        }
|
src/roberta_regressor.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Defines a wrapper class of RobertaPreTrainedModel model to do regression on text data.
|
| 3 |
+
Based on: https://www.kaggle.com/code/sumantindurkhya/bert-for-regression
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from typing import Optional, Tuple, Union
|
| 7 |
+
from tqdm import tqdm, trange
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
import torch
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
import torch.utils.checkpoint
|
| 13 |
+
from torch import nn
|
| 14 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
| 15 |
+
|
| 16 |
+
from transformers import BertModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel
|
| 17 |
+
|
| 18 |
+
class RobertaRegressor(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a small feed-forward regression head."""

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        """
        Parameters
        ----------
        config :
            Model configuration passed to ``RobertaModel``.
        num_outputs : int
            Number of values predicted per input.
        dropout : float
            Dropout probability used in the head.
        freeze_bert : bool
            When True the encoder parameters are excluded from training.
        """
        super().__init__(config)

        self.num_outputs = num_outputs

        self.roberta = RobertaModel(config)
        if freeze_bert:
            # freeze the roberta parameters
            for param in self.roberta.parameters():
                param.requires_grad = False
        self.classifier = nn.Linear(config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()
        self.regressor = nn.Linear(128, num_outputs)

    def forward(self, input_ids, attention_mask):
        """Return the head output computed from the encoder's pooled output."""
        # forward pass of the model
        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = base_out.pooler_output
        out = self.classifier(logits)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.tanh(out)
        out = self.dropout(out)
        out = self.regressor(out)
        return out

    def predict(self, text:str, tokenizer, device, numpy=True) -> Union[np.ndarray, torch.Tensor]:
        """Predict the outputs for one text.

        Returns a NumPy array when ``numpy`` is true and a detached tensor
        otherwise.

        NOTE(review): the module's train/eval mode is left untouched, so
        dropout is active if the caller has not switched to ``eval()``.
        """
        encoded = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt')
        # Select the tensors by name instead of relying on the order and
        # number of entries the tokenizer returns.
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        # Gradients are never needed for inference.
        with torch.no_grad():
            output = self(input_ids, attention_mask).squeeze()
        # free up memory
        del input_ids, attention_mask
        out = output.detach()
        if numpy:
            return out.cpu().numpy()
        return out
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class RobertaSeqMultiRegressor(RobertaPreTrainedModel):
    """
    RoBERTa encoder with one head per requested output.

    The intended design is a sequential multi-output regressor in which each
    output is predicted conditioned on the previous ones, which suits
    correlated targets such as probability distributions whose values must
    sum to 1.

    NOTE(review): as written, every head is an independent
    ``Linear(hidden_size, 128)`` applied to the same pooled encoder output;
    no head receives the output of another head, and ``forward`` returns a
    list with one 128-wide tensor per head.
    """

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        super().__init__(config)

        self.num_outputs = num_outputs
        self.roberta = RobertaModel(config)
        if freeze_bert:
            # keep the encoder weights fixed during training
            for weight in self.roberta.parameters():
                weight.requires_grad = False

        # one head per output, registered as regressor_0, regressor_1, ...
        for head_idx in range(num_outputs):
            setattr(self, f"regressor_{head_idx}", nn.Linear(config.hidden_size, 128))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()

    def forward(self, input_ids, attention_mask):
        """Return the list of head outputs for the given batch."""
        pooled = self.roberta(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        head_outputs = []
        for head_idx in range(self.num_outputs):
            head = getattr(self, f"regressor_{head_idx}")
            head_outputs.append(self.tanh(self.relu(self.dropout(head(pooled)))))
        return head_outputs
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def sum_diff_loss(output, target):
    """Return the sum of the absolute element-wise differences (a scalar tensor)."""
    return (output - target).abs().sum()
|
| 112 |
+
|
| 113 |
+
def evaluate(model, criterion, dataloader, device, sum_diff_penalty=False):
    """
    Return the mean of the per-batch losses of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Each batch is an ``(input_ids, attention_mask, target)`` triple.
    ``sum_diff_penalty`` is accepted but not used.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for input_ids, attention_mask, target in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            prediction = model(input_ids, attention_mask)
            batch_loss = criterion(prediction.squeeze(), target.type_as(prediction))
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)
|
| 127 |
+
|
| 128 |
+
# def predict(model, dataloader, device):
|
| 129 |
+
# predicted_label = []
|
| 130 |
+
# actual_label = []
|
| 131 |
+
# with torch.no_grad():
|
| 132 |
+
# for input_ids, attention_mask, target in (dataloader):
|
| 133 |
+
|
| 134 |
+
# input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
|
| 135 |
+
# output = model(input_ids, attention_mask)
|
| 136 |
+
|
| 137 |
+
# predicted_label += output
|
| 138 |
+
# actual_label += target
|
| 139 |
+
|
| 140 |
+
# return predicted_label
|
| 141 |
+
|
| 142 |
+
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    """
    Train ``model`` for ``epochs`` epochs, printing losses after each epoch.

    Each batch of ``train_loader`` is an ``(input_ids, attention_mask, target)``
    triple. After every epoch the mean training loss is printed, followed by
    the validation loss computed with ``evaluate`` on ``val_loader``.
    """
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        running_loss = 0
        for input_ids, attention_mask, target in train_loader:
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            prediction = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_loss = criterion(prediction.squeeze(), target.type_as(prediction))
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        print(f"Training loss is {running_loss/len(train_loader)}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))
|
| 163 |
+
|
| 164 |
+
def multi_reg_loss(loss='mse', sum_diff_penalty:float=0.0):
    """
    Build a loss for multi-output regression with an optional penalty on the
    gap between the total of the predictions and the total of the targets.

    The penalty nudges the model towards outputs where
    sum(y_hat1, y_hat2, ...) = sum(y1, y2, ...)

    e.g: in task d, sum(label1, label2, label3, label4) = 1
    since the labels form a probability distribution.

    Parameters
    ----------
    loss : str, optional
        Base loss, by default 'mse'
        Available options: 'mse' and 'cross_entropy'
        (F.mse_loss and F.cross_entropy respectively)
    sum_diff_penalty : float, optional
        Weight of the squared sum-difference term, by default 0.0 (no penalty)

    Returns
    -------
    callable
        reg_loss(input, target) -> base loss + sum_diff_penalty * penalty

    Raises
    ------
    ValueError
        If `loss` is not one of the available options.
    """
    if loss not in ('mse', 'cross_entropy'):
        raise ValueError("Invalid loss function. Available options: 'mse' and 'cross_entropy'")
    loss_func = F.mse_loss if loss == 'mse' else F.cross_entropy

    def reg_loss(input, target):
        # base term: whichever loss was selected above
        base_term = loss_func(input, target)
        # penalty term: both sums run over every element of the tensors
        # (the whole batch), not over each sample separately
        total_gap = torch.sum(input) - torch.sum(target)
        return base_term + sum_diff_penalty*torch.square(total_gap)

    return reg_loss
|
| 196 |
+
|
src/train.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
# Embeddings
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
|
| 6 |
+
# train a classifier on the embeddings for multiclass regression
|
| 7 |
+
from sklearn.model_selection import train_test_split
|
| 8 |
+
from sklearn.metrics import (
|
| 9 |
+
r2_score, mean_squared_error, # regression metrics
|
| 10 |
+
accuracy_score, f1_score, precision_score, recall_score # classification metrics
|
| 11 |
+
)
|
| 12 |
+
from sklearn.multioutput import MultiOutputRegressor, RegressorChain # for multiclass regression
|
| 13 |
+
|
| 14 |
+
# Estimators
|
| 15 |
+
from sklearn.ensemble import (
|
| 16 |
+
RandomForestRegressor,
|
| 17 |
+
RandomForestClassifier,
|
| 18 |
+
GradientBoostingRegressor,
|
| 19 |
+
GradientBoostingClassifier,
|
| 20 |
+
AdaBoostRegressor,
|
| 21 |
+
AdaBoostClassifier
|
| 22 |
+
)
|
| 23 |
+
from sklearn.linear_model import (
|
| 24 |
+
LinearRegression,
|
| 25 |
+
LogisticRegression,
|
| 26 |
+
Ridge,
|
| 27 |
+
Lasso
|
| 28 |
+
)
|
| 29 |
+
# other regressors
|
| 30 |
+
from sklearn.svm import SVR
|
| 31 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 32 |
+
from sklearn.neural_network import MLPRegressor
|
| 33 |
+
from sklearn.tree import DecisionTreeRegressor
|
| 34 |
+
|
| 35 |
+
from lightgbm import LGBMRegressor, LGBMClassifier
|
| 36 |
+
|
| 37 |
+
# type hinting
|
| 38 |
+
import os, json
|
| 39 |
+
from typing import List, Callable, Dict, Tuple, Any
|
| 40 |
+
|
| 41 |
+
# local imports
|
| 42 |
+
from src import data, utils
|
| 43 |
+
from src.embeddings import EmbeddingsRegressor
|
| 44 |
+
|
| 45 |
+
def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
    """
    Score for multiclass regression: the RMSE of each label, averaged over labels.

    Both arrays are indexed as [:, i], so they must be 2-D with one column per
    label; the number of labels is read from y_true.shape[1].
    """
    n_labels = y_true.shape[1]
    per_label_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(n_labels)
    ]
    return np.mean(per_label_rmse)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
    """
    Bind a list of (name, estimator) pairs and a scoring function into a
    reusable fit-and-score routine.

    The returned function fits every estimator on the training split (in
    place), scores its predictions on the test split with
    score_func(y_test, y_pred), prints one line per estimator and returns a
    dict mapping estimator name -> score.
    """
    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
        def _fit_and_score(estimator):
            estimator.fit(X_train, y_train)
            return score_func(y_test, estimator.predict(X_test))

        estimator_scores = {}
        for name, estimator in estimators:
            score = _fit_and_score(estimator)
            print(f"\"{name}\" estimator score: {score:.4f}")
            estimator_scores[name] = score
        return estimator_scores

    return fit_eval_estimators
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def get_data():
    """
    Load the 'train' and 'test' splits and return (train_df, val_df, test_df).

    Steps:
      1. load both splits through `data.load` and merge each subject's
         messages with `data.concat_messages`;
      2. hold out 15% of the training subject ids for validation, stratified
         by a per-subject class derived from the `d_*` columns;
      3. augment the remaining training rows with a copy that keeps only the
         first half of each subject's ' | '-separated messages.

    The code relies on the concatenated frames exposing 'subject_id',
    'message' and columns whose names start with 'd_'.
    """
    # load the train and test data
    train_data = data.load('train')
    test_df = data.load('test')
    # one row per subject with its messages concatenated
    train_data = data.concat_messages(train_data)
    test_df = data.concat_messages(test_df)

    def _dominant_label(row):
        # if every d_ column except the last sums to less than 0.5, take the
        # argmax over all of them; otherwise ignore the last column
        leading = row[:-1]
        if leading.sum() < 0.5:
            return row.argmax()
        return leading.argmax()

    # class per subject, used only to stratify the split
    label_probs = train_data.set_index('subject_id').filter(regex='^d_')
    position_to_name = dict(enumerate(train_data.filter(regex='^d_').columns))
    subj_classes = label_probs.apply(_dominant_label, axis=1).replace(position_to_name)

    # 15% of the subject ids go to validation
    tr_subj_ids, val_subj_ids = train_test_split(
        subj_classes.index,
        test_size=0.15,
        random_state=42,
        stratify=subj_classes.values,
    )
    val_df = train_data[train_data['subject_id'].isin(val_subj_ids)]
    train_df = train_data[train_data['subject_id'].isin(tr_subj_ids)]

    def _first_half(text):
        # NOTE(review): a text without any ' | ' separator yields an empty string
        parts = text.split(' | ')
        return ' | '.join(parts[:len(parts)//2])

    # augmentation: same rows, but with only the first half of the messages
    half_messages_df_train = train_df.assign(message=train_df['message'].apply(_first_half))
    train_df = (
        pd.concat([train_df, half_messages_df_train], axis=0)
        .sort_values('subject_id')
        .reset_index(drop=True)
    )
    return train_df, val_df, test_df
|
src/utils.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Tuple
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def print_messages(msgs:List[dict]):
    """
    Write every message of a subject to stdout, one "<date> - <message>" line each.

    `msgs` is a list of dictionaries shaped like
    [{'id_message': {int_id}, 'message': '{str_message}', 'date': '{str_date}'}, ...]
    that all belong to one specific subject. Only the 'date' and 'message'
    keys are read.
    """
    for entry in msgs:
        line = "{} - {}".format(entry['date'], entry['message'])
        print(line)
|
| 14 |
+
|
| 15 |
+
def load_data(files, truth):
    """
    Load all the data into a dataframe.

    Parameters
    ----------
    files : iterable of str
        Paths to JSON files, each holding a list of message dictionaries with
        the keys 'id_message', 'date' and 'message'. The subject id is the
        file name up to its first '.'.
    truth : pd.DataFrame
        Frame with a 'subject_id' column; it is merged onto the messages.

    Returns
    -------
    pd.DataFrame
        One row per message with the columns 'subject_id', 'id_message',
        'date', 'message' plus whatever `truth` contributes. Subjects missing
        from `truth` are dropped by the default inner merge.
    """
    import os
    import json

    rows = []
    for path in files:
        # computed once per file rather than once per message
        subject_id = os.path.basename(path).split('.')[0]
        # JSON is UTF-8; do not depend on the platform's default encoding
        with open(path, encoding='utf-8') as file:
            msgs = json.load(file)
        rows.extend(
            [subject_id, msg['id_message'], msg['date'], msg['message']]
            for msg in msgs
        )
    df = pd.DataFrame(rows, columns=['subject_id', 'id_message', 'date', 'message'])
    return df.merge(truth, on='subject_id')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def normalize(x, prob=True, n_labels=4):
    """
    Normalize a vector to [0,1] and sum 1 if prob=True

    The input is reshaped to (-1, n_labels) and every row is min-max scaled
    independently, rounded to 4 decimals, and optionally divided by its sum.

    Parameters
    ----------
    x : array-like
        Values whose total size is a multiple of n_labels.
    prob : bool, optional
        When True each row is additionally rescaled to sum to 1, by default True
    n_labels : int, optional
        Number of values per row, by default 4

    Returns
    -------
    np.ndarray
        Array of shape (-1, n_labels), rounded to 4 decimals.
        A row whose values are all equal has no range to scale by: it maps to
        all zeros when prob=False and to the uniform value 1/n_labels when
        prob=True, instead of the NaNs a 0/0 division would give.
    """
    x = np.asarray(x).reshape(-1, n_labels)
    row_min = x.min(axis=1, keepdims=True)
    row_span = x.max(axis=1, keepdims=True) - row_min
    # normalize to [0,1]; constant rows divide by 1 so they become zeros
    x = ((x - row_min) / np.where(row_span == 0, 1, row_span)).round(4)
    if prob:
        # normalize to sum 1; an all-zero row (constant input) becomes uniform
        totals = x.sum(axis=1, keepdims=True)
        is_zero = totals == 0
        x = np.where(is_zero, 1.0 / n_labels, x / np.where(is_zero, 1, totals))
    return x.round(4)
|
| 40 |
+
|
| 41 |
+
def label_metrics(score_fun, y_true, y_pred):
    """
    Apply `score_fun` to every label column separately.

    Column i of `y_true` is paired with column i of `y_pred`; the number of
    columns is taken from y_true.shape[1]. Returns the list of per-column
    scores in column order.
    """
    return [
        score_fun(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
|
| 46 |
+
|
| 47 |
+
def make_predict(predict_fn, **kwargs):
    """
    Wrap `predict_fn` into a single-argument function.

    The keyword arguments given here are captured once and forwarded on every
    call, so the returned `predict(msg)` is equivalent to
    `predict_fn(msg, **kwargs)`.
    """
    def predict(msg):
        return predict_fn(msg, **kwargs)

    return predict
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
    """
    Metric for simple and multiclass regression. Computes the average of the RMSE scores for each label.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth values, shape (n_samples,) or (n_samples, n_labels).
    y_pred : np.ndarray
        Predicted values, same shape as y_true.

    Returns
    -------
    float
        Mean over the labels of the per-label root mean squared error.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # a 1-D vector is the single-label ("simple") case: treat it as one column
    if y_true.ndim == 1:
        y_true = y_true[:, None]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    # per-label RMSE computed for all columns at once
    rmse_scores = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return np.mean(rmse_scores)
|