Spaces:
Sleeping
Sleeping
made simple functional streamlit app to host the model
Browse files- app.py +38 -0
- models/2d_ridge_roberta-suicide-regchain-pca-final.pkl +0 -0
- requirements.txt +5 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-38.pyc +0 -0
- src/__pycache__/class_eval.cpython-38.pyc +0 -0
- src/__pycache__/data.cpython-38.pyc +0 -0
- src/__pycache__/embeddings.cpython-38.pyc +0 -0
- src/__pycache__/eval.cpython-38.pyc +0 -0
- src/__pycache__/multiregression.cpython-38.pyc +0 -0
- src/__pycache__/roberta_regressor.cpython-38.pyc +0 -0
- src/__pycache__/utils.cpython-38.pyc +0 -0
- src/berta_finetuning.py +28 -0
- src/class_eval.py +576 -0
- src/data.py +104 -0
- src/embeddings.py +49 -0
- src/eval.py +195 -0
- src/models.py +5 -0
- src/multiregression.py +108 -0
- src/roberta_regressor.py +196 -0
- src/train.py +92 -0
- src/utils.py +62 -0
app.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pickle
|
3 |
+
import numpy as np
|
4 |
+
import os, glob, json, sys
|
5 |
+
import pickle
|
6 |
+
import pandas as pd
|
7 |
+
import numpy as np
|
8 |
+
from sentence_transformers import SentenceTransformer
|
9 |
+
|
10 |
+
from src import data, utils
|
11 |
+
from src.embeddings import EmbeddingsRegressor
|
12 |
+
|
13 |
+
|
14 |
+
# Load the models.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the expensive objects are built inside a cached factory and
# reused across reruns instead of being re-created for every keystroke.
model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'


@st.cache_resource
def load_predict():
    """Build the prediction callable once and cache it.

    Unpickles the regressor shipped in ``models/``, instantiates the
    sentence-transformer named by ``model_name``, wraps both in an
    ``EmbeddingsRegressor`` and returns ``utils.make_predict(model.predict)``.
    """
    # NOTE: pickle.load executes code stored in the file; only load artifacts
    # that ship with this repository.
    with open('models/2d_ridge_roberta-suicide-regchain-pca-final.pkl', 'rb') as f:
        regressor = pickle.load(f)
    tokenizer = SentenceTransformer(model_name)
    model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=True)
    return utils.make_predict(model.predict)


predict = load_predict()

# model_selector = st.sidebar.selectbox(
#     'Select model:',
#     ['roberta', 'roberta_seq_multi', 'roberta_seq_multi_2']
# )

text_input = st.text_input('Enter your text here:')
if text_input:
    # One input text -> one row of four regression outputs.
    prediction = predict([text_input]).tolist()
    prediction = np.array(prediction).reshape(-1,4)
    prediction = utils.normalize(prediction)
    preds_df = data.make_task_labels_from_d(prediction, include_d=True).rename(
        columns={c:'d_'+c.replace('+','_').replace('|','_') for c in data.task_d_cols}
    )
    # Keep the binary score inside [0, 1] before displaying it.
    preds_df['b_label'] = np.clip(preds_df['b_label'], 0, 1)
    # show the dataframe
    st.table(preds_df)
|
models/2d_ridge_roberta-suicide-regchain-pca-final.pkl
ADDED
Binary file (154 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
sentence-transformers
|
3 |
+
pandas
|
4 |
+
streamlit
|
5 |
+
scikit-learn>=1.2.1
|
src/__init__.py
ADDED
File without changes
|
src/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (172 Bytes). View file
|
|
src/__pycache__/class_eval.cpython-38.pyc
ADDED
Binary file (15.7 kB). View file
|
|
src/__pycache__/data.cpython-38.pyc
ADDED
Binary file (4.34 kB). View file
|
|
src/__pycache__/embeddings.cpython-38.pyc
ADDED
Binary file (2.15 kB). View file
|
|
src/__pycache__/eval.cpython-38.pyc
ADDED
Binary file (7.51 kB). View file
|
|
src/__pycache__/multiregression.cpython-38.pyc
ADDED
Binary file (4.34 kB). View file
|
|
src/__pycache__/roberta_regressor.cpython-38.pyc
ADDED
Binary file (6.73 kB). View file
|
|
src/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (2.52 kB). View file
|
|
src/berta_finetuning.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
3 |
+
from datasets import Dataset, load_dataset#, Features, Value, ClassLabe
|
4 |
+
|
5 |
+
ds = load_dataset('nlpUc3mStudents/mental-risk-c')

# Work in pandas for the per-subject aggregation below.
train_df, test_df = (ds[split].to_pandas() for split in ('train', 'test'))

# Every column from the fifth onwards is treated as a label column.
label_names = train_df.columns[4:].tolist()

# Concatenate the messages of each subject into one string and keep the first
# value of every label column for that subject.
_aggregations = {'message': lambda msgs: ' | '.join(msgs)}
_aggregations.update({col: 'first' for col in label_names})
train_by_subjectid = train_df.groupby('subject_id').agg(_aggregations).reset_index()
# Previously drafted (disabled) extra column:
#     .assign(num_messages=lambda x: x.message.str.count('\|') + 1)

# Back to a `datasets.Dataset`.
train_df = Dataset.from_pandas(train_by_subjectid)

model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# This checkpoint was trained with 2 labels but 4 are needed here, so the
# classification head still has to be replaced; the model is not built yet.
model = None
|
27 |
+
|
28 |
+
|
src/class_eval.py
ADDED
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#This file has been developed by the SINAI research group for its usage in the MentalRiskES evaluation campaign at IberLEF 2023.
|
2 |
+
|
3 |
+
# Required libraries
|
4 |
+
import pandas as pd
|
5 |
+
import numpy as np
|
6 |
+
import sklearn.metrics as metrics
|
7 |
+
from scipy.stats import pearsonr
|
8 |
+
|
9 |
+
# Read Gold labels for BinaryClassification
|
10 |
+
def read_qrels(qrels_file):
    """Read the gold labels for binary classification.

    ``qrels_file`` is read with ``pandas.read_csv`` and must contain the
    columns ``Subject`` and ``label``.  Returns a dict mapping each subject to
    its label converted with ``int``; the number of entries is printed.
    """
    gold = pd.read_csv(qrels_file)
    labels_by_subject = {
        row['Subject']: int(row['label']) for _, row in gold.iterrows()
    }
    print(f"\n{len(labels_by_subject)} lines read in qrels file!\n\n")
    return labels_by_subject
|
17 |
+
|
18 |
+
# Read Gold labels for Simple Regression
|
19 |
+
def read_qrels_regression(qrels_file):
    """Read the gold labels for simple regression.

    ``qrels_file`` is read with ``pandas.read_csv`` and must contain the
    columns ``Subject`` and ``label``.  Returns a dict mapping each subject to
    its label converted with ``float``; the number of entries is printed.
    """
    gold = pd.read_csv(qrels_file)
    scores_by_subject = {
        row['Subject']: float(row['label']) for _, row in gold.iterrows()
    }
    print(f"\n{len(scores_by_subject)} lines read in qrels file!\n\n")
    return scores_by_subject
|
26 |
+
|
27 |
+
# Read Gold labels for Multiclass classification
|
28 |
+
def read_qrels_multiclass(qrels_file):
    """Read the gold labels for multiclass classification.

    ``qrels_file`` is read with ``pandas.read_csv`` and must contain the
    columns ``Subject`` and ``label``.

    Returns a pair ``(binary, multiclass)`` of dicts keyed by subject:
    ``multiclass`` holds the label as read, ``binary`` holds 1 when the label
    contains the substring ``"suffer"`` and 0 otherwise.
    """
    gold = pd.read_csv(qrels_file)
    binary = {}
    multiclass = {}
    for _, row in gold.iterrows():
        subject = row['Subject']
        multiclass[subject] = row['label']
        binary[subject] = 1 if "suffer" in row['label'] else 0
    print(f"\n{len(binary)} lines read in qrels file!\n\n")
    return binary, multiclass
|
40 |
+
|
41 |
+
# Read Gold labels for Multi-output regression
|
42 |
+
def read_qrels_multioutput(qrels_file):
    """Read the gold labels for multi-output regression.

    ``qrels_file`` is read with ``pandas.read_csv`` and must contain the
    columns ``Subject``, ``suffer_in_favour``, ``suffer_against``,
    ``suffer_other`` and ``control``.  Returns a dict mapping each subject to
    the list of those four values, in that order; the number of entries is
    printed.
    """
    output_columns = ('suffer_in_favour', 'suffer_against', 'suffer_other', 'control')
    gold = pd.read_csv(qrels_file)
    targets_by_subject = {
        row['Subject']: [row[column] for column in output_columns]
        for _, row in gold.iterrows()
    }
    print(f"\n{len(targets_by_subject)} lines read in qrels file!\n\n")
    return targets_by_subject
|
49 |
+
|
50 |
+
###########################################################################
|
51 |
+
# Calculation of Binary classification metrics for Binary classification tasks
|
52 |
+
class BinaryClassification():
    """Decision-, latency- and rank-based metrics for a binary classification run.

    ``data`` is stored as ``run_results``; the methods read its ``nick``,
    ``pred`` and ``round`` columns.  ``qrels`` is the path of the gold-label
    CSV, loaded with ``read_qrels`` into ``qrels_b`` (subject -> 0/1).
    ``task`` selects the latency penalty constant (see ``penalty``).
    """

    def __init__(self, task, data, qrels):
        self.run_results = data
        self.qrels_b = read_qrels(qrels)
        self.task = task

    def penalty(self, delay):
        """Return the latency penalty for a decision taken after ``delay`` rounds.

        The penalty is ``-1 + 2 / (1 + exp(-p * (delay - 1)))`` where ``p``
        depends on ``self.task``.
        """
        if self.task == "1":  # TCA
            p = 0.0292  # trial
        elif self.task == "2":  # Depression
            p = 0.0179  # trial
        else:
            # Any other task used to leave ``p`` unbound and crash with
            # UnboundLocalError; use the same fallback constant as
            # BinaryMultiClassification.penalty.
            p = 0.0308
        return -1.0 + 2.0 / (1 + np.exp(-p * (delay - 1)))

    def n_pos(self):
        """Return the number of positive subjects (sum of the 0/1 gold labels)."""
        return sum(self.qrels_b.values())

    def eval_performance(self):
        """Compute and print the decision-based and latency-based metrics.

        Sorts ``self.run_results`` by ``nick`` (the sorted frame is stored
        back on the instance) and returns a dict with accuracy, macro/micro
        precision, recall and F1, ERDE5/30/50, the median latency of the true
        positives, speed and latency-weighted F1.
        """
        print("===================================================")
        print("DECISION-BASED EVALUATION:")
        self.run_results = self.run_results.sort_values(by=['nick'])
        total_pos = self.n_pos()
        n_rows = len(self.run_results)
        erdes5 = np.zeros(n_rows)
        erdes30 = np.zeros(n_rows)
        erdes50 = np.zeros(n_rows)
        true_pos = 0
        false_pos = 0
        latency_tps = []
        penalty_tps = []

        # Latency-based metrics
        for ierdes, (_, r) in enumerate(self.run_results.iterrows()):
            try:
                if self.qrels_b[r['nick']] == r['pred']:
                    if r['pred'] == 1:
                        # True positive: cost grows with the decision delay.
                        true_pos += 1
                        delay = r["round"] + 1
                        erdes5[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 5.0)))
                        erdes30[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 30.0)))
                        erdes50[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 50.0)))
                        latency_tps.append(delay)
                        penalty_tps.append(self.penalty(delay))
                    else:
                        # True negative: no cost.
                        erdes5[ierdes] = 0
                        erdes30[ierdes] = 0
                        erdes50[ierdes] = 0
                elif r['pred'] == 1:
                    # False positive: cost is the proportion of positives.
                    # ERDE30 was previously left at 0 here, unlike ERDE5/50.
                    false_pos += 1
                    fp_cost = float(total_pos) / float(len(self.qrels_b))
                    erdes5[ierdes] = fp_cost
                    erdes30[ierdes] = fp_cost
                    erdes50[ierdes] = fp_cost
                else:
                    # False negative: maximum cost.
                    erdes5[ierdes] = 1
                    erdes30[ierdes] = 1
                    erdes50[ierdes] = 1
            except KeyError:
                print("User does not appear in the qrels:"+r['nick'])

        if true_pos != 0:
            # Only computed when there is at least one true positive, which
            # avoids taking the median of an empty list.
            _speed = 1 - np.median(np.array(penalty_tps))
            precision = float(true_pos) / float(true_pos + false_pos)
            recall = float(true_pos) / float(total_pos)
            f1_erde = 2 * (precision * recall) / (precision + recall)
            _latencyweightedF1 = f1_erde * _speed
        else:
            _latencyweightedF1 = 0
            _speed = 0

        # NOTE(review): the run predictions are passed to sklearn as y_true
        # and the gold labels as y_pred, which swaps macro precision and
        # recall; left unchanged to keep the reported numbers stable.
        # The comparison is positional, so it presumably relies on the qrels
        # file listing subjects in the same order as the nick-sorted run.
        y_true = self.run_results['pred'].tolist()
        y_pred_b = list(self.qrels_b.values())

        # Binary metrics
        accuracy = metrics.accuracy_score(y_true, y_pred_b)
        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')

        median_latency = np.median(np.array(latency_tps))

        print("BINARY METRICS: =============================")
        print("Accuracy:"+str(accuracy))
        print("Macro precision:"+str(macro_precision))
        print("Macro recall:"+str(macro_recall))
        print("Macro f1:"+str(macro_f1))
        print("Micro precision:"+str(micro_precision))
        print("Micro recall:"+str(micro_recall))
        print("Micro f1:"+str(micro_f1))

        print("LATENCY-BASED METRICS: =============================")
        print("ERDE_5:"+str(np.mean(erdes5)))
        print("ERDE_50:"+str(np.mean(erdes50)))
        print("Median latency:"+str(median_latency))
        print("Speed:"+str(_speed))
        print("latency-weightedF1:"+str(_latencyweightedF1))

        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,
                'Macro_F1': macro_f1, 'Micro_P': micro_precision, 'Micro_R': micro_recall,
                'Micro_F1': micro_f1, 'ERDE5': np.mean(erdes5), 'ERDE30': np.mean(erdes30),
                'ERDE50': np.mean(erdes50), 'latencyTP': median_latency,
                'speed': _speed, 'latency-weightedF1': _latencyweightedF1}

    # Calculation of P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Compute and print P@10, P@20, P@30 and P@50.

        Rows are ordered by descending ``pred`` and a row counts as a hit when
        ``pred`` equals its gold label.  Returns a dict keyed by 1, 50 and 75
        whose values are ``{"@10": ..., "@20": ..., "@30": ..., "@50": ...}``.
        The same run data is used for every key, so the three entries are
        equal.
        """
        print("===================================================")
        print("RANK-BASED EVALUATION:")
        ranks_at = [1, 50, 75]
        rank_dit = {}

        # Labels are attached positionally, so align on the nick-sorted order
        # (the same alignment eval_performance uses) on a private copy.  The
        # previous code re-attached the labels to a frame it had already
        # re-sorted by prediction, misaligning them from the second rank on.
        ranked = self.run_results.sort_values(by=['nick']).copy()
        ranked["label"] = list(self.qrels_b.values())
        ranked = ranked.sort_values(by=['pred'], ascending=False)
        hits = (ranked['pred'] == ranked['label']).tolist()[:50]

        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            # Each precision divides by k even when fewer than k rows exist.
            p10 = float(sum(hits[:10])) / 10.0
            p20 = float(sum(hits[:20])) / 20.0
            p30 = float(sum(hits[:30])) / 30.0
            p50 = float(sum(hits[:50])) / 50.0

            print("PRECISION AT K: =============================")
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@10": p10, "@20": p20, "@30": p30, "@50": p50}
        return rank_dit
|
204 |
+
|
205 |
+
|
206 |
+
#############################################################################################
|
207 |
+
# Calculation of Regression metrics for Simple regression tasks
|
208 |
+
class ClassRegressionEvaluation():
    """Regression and rank-based metrics for a simple regression run.

    ``data`` is stored as ``run_results`` and ``qrels`` is the path of the
    gold-label CSV, loaded with ``read_qrels_regression`` into ``qrels``
    (subject -> float).

    NOTE(review): ``eval_performance`` treats ``run_results`` as a single
    DataFrame with ``nick`` and ``pred`` columns, while
    ``eval_performance_rank_based`` indexes it by round number
    (``run_results[rank]``) and expects a DataFrame per round; presumably the
    two methods are used on differently built instances - confirm with the
    caller.
    """

    def __init__(self, task, data, qrels):
        self.run_results = data
        self.qrels = read_qrels_regression(qrels)
        self.task = task

    def eval_performance(self):
        """Compute and print RMSE and the Pearson correlation coefficient.

        Sorts ``self.run_results`` by ``nick`` (stored back on the instance)
        and compares its ``pred`` column positionally with the gold values.
        Returns ``{'RMSE:': ..., 'Pearson_coefficient': ...}``.

        Raises:
            ValueError: if predictions and gold labels differ in length.
        """
        self.run_results = self.run_results.sort_values(by=['nick'])
        y_true = self.run_results['pred'].tolist()
        y_pred_r = list(self.qrels.values())

        # Regression metrics.
        # RMSE is computed directly with numpy: the previous
        # ``mean_squared_error(..., squared=False)`` call relies on a keyword
        # that newer scikit-learn releases deprecate and remove.
        predicted = np.asarray(y_true, dtype=float)
        gold = np.asarray(y_pred_r, dtype=float)
        if predicted.shape != gold.shape:
            raise ValueError(
                "Found input variables with inconsistent numbers of samples: "
                + str([len(y_true), len(y_pred_r)])
            )
        _rmse = np.sqrt(np.mean(np.square(predicted - gold)))
        _pearson, _ = pearsonr(y_true, y_pred_r)

        print("REGRESSION METRICS: =============================")
        print("RMSE:"+str(_rmse))
        print("Pearson correlation coefficient:"+str(_pearson))

        # The 'RMSE:' key (with the colon) is kept as is for existing callers.
        return {'RMSE:': _rmse, 'Pearson_coefficient': _pearson}

    # Calculation of P@5, P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Compute and print P@5, P@10, P@20, P@30 and P@50 per round.

        For each round in 1, 25, 50 and 75 the frame ``self.run_results[round]``
        is sorted by ``nick``, gets the gold values attached positionally as
        ``label`` and is then ordered by descending ``pred``.  A row is a hit
        when its label equals the prediction rounded to one decimal.  The
        last processed frame is kept in ``self.run_results_``.

        Returns a dict keyed by round with the five precision values.
        """
        print("===================================================")
        print("RANK-BASED EVALUATION:")
        ranks_at = [1, 25, 50, 75]
        rank_dit = {}
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
            self.run_results_["label"] = list(self.qrels.values())
            self.run_results_ = self.run_results_.sort_values(by=['pred'], ascending=False)

            # Only the first 50 ranked rows can contribute to any cutoff.
            hits = [
                r["label"] == round(r["pred"], 1)
                for _, r in self.run_results_.head(50).iterrows()
            ]
            # Each precision divides by k even when fewer than k rows exist.
            p5 = float(sum(hits[:5])) / 5.0
            p10 = float(sum(hits[:10])) / 10.0
            p20 = float(sum(hits[:20])) / 20.0
            p30 = float(sum(hits[:30])) / 30.0
            p50 = float(sum(hits[:50])) / 50.0

            print("PRECISION AT K: =============================")
            print("P@5:"+str(p5))
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@5": p5, "@10": p10, "@20": p20, "@30": p30, "@50": p50}
        return rank_dit
|
287 |
+
|
288 |
+
|
289 |
+
############################################################################
|
290 |
+
# Calculation of Binary metrics for Multiclass classification tasks
|
291 |
+
class BinaryMultiClassification():
    """Decision-, latency- and rank-based metrics for a multiclass run.

    ``data`` is stored as ``run_results``; the methods read its ``nick``,
    ``pred`` (multiclass prediction), ``pred_b`` (binary prediction) and
    ``round`` columns.  ``qrels`` is the path of the gold-label CSV, loaded
    with ``read_qrels_multiclass`` into ``qrels_b`` (subject -> 0/1) and
    ``qrels_multiclass`` (subject -> original label).  ``task`` selects the
    latency penalty constant (see ``penalty``).
    """

    def __init__(self, task, data, qrels):
        self.run_results = data
        self.qrels_b, self.qrels_multiclass = read_qrels_multiclass(qrels)
        self.task = task

    def penalty(self, delay):
        """Return the latency penalty for a decision taken after ``delay`` rounds.

        The penalty is ``-1 + 2 / (1 + exp(-p * (delay - 1)))`` where ``p``
        depends on ``self.task``.
        """
        # The original code assigned a "test" constant (0.0411 for task "1",
        # 0.0326 for task "2") and immediately overwrote it with the "trial"
        # one; only the effective values are kept here.
        if self.task == "1":  # TCA
            p = 0.0292  # trial
        elif self.task == "2":  # Depression
            p = 0.0179  # trial
        else:  # Unknown
            p = 0.0308  # test
        return -1.0 + 2.0 / (1 + np.exp(-p * (delay - 1)))

    def n_pos(self):
        """Return the number of positive subjects (sum of the 0/1 gold labels)."""
        return sum(self.qrels_b.values())

    def eval_performance(self):
        """Compute and print the decision-based and latency-based metrics.

        Latency metrics use the binary prediction ``pred_b`` against
        ``qrels_b``; the sklearn metrics compare ``pred`` with the multiclass
        gold labels.  Sorts ``self.run_results`` by ``nick`` (stored back on
        the instance) and returns a dict with accuracy, macro/micro precision,
        recall and F1, ERDE5/30/50, the median latency of the true positives,
        speed and latency-weighted F1.
        """
        print("===================================================")
        print("DECISION-BASED EVALUATION:")
        self.run_results = self.run_results.sort_values(by=['nick'])
        total_pos = self.n_pos()  # Total number of positive documents
        n_rows = len(self.run_results)
        erdes5 = np.zeros(n_rows)
        erdes30 = np.zeros(n_rows)
        erdes50 = np.zeros(n_rows)
        true_pos = 0
        false_pos = 0
        latency_tps = []
        penalty_tps = []

        for ierdes, (_, r) in enumerate(self.run_results.iterrows()):
            try:
                if self.qrels_b[r['nick']] == r['pred_b']:
                    if r['pred_b'] == 1:
                        # True positive: cost grows with the decision delay.
                        true_pos += 1
                        delay = r["round"] + 1
                        erdes5[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 5.0)))
                        erdes30[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 30.0)))
                        erdes50[ierdes] = 1.0 - (1.0 / (1.0 + np.exp(delay - 50.0)))
                        latency_tps.append(delay)
                        penalty_tps.append(self.penalty(delay))
                    else:
                        # True negative: no cost.
                        erdes5[ierdes] = 0
                        erdes30[ierdes] = 0
                        erdes50[ierdes] = 0
                elif r['pred_b'] == 1:
                    # False positive: cost is the proportion of positives.
                    false_pos += 1
                    fp_cost = float(total_pos) / float(len(self.qrels_b))
                    erdes5[ierdes] = fp_cost
                    erdes30[ierdes] = fp_cost
                    erdes50[ierdes] = fp_cost
                else:
                    # False negative: maximum cost.
                    erdes5[ierdes] = 1
                    erdes30[ierdes] = 1
                    erdes50[ierdes] = 1
            except KeyError:
                print("User does not appear in the qrels:"+r['nick'])

        if true_pos != 0:
            # Only computed when there is at least one true positive, which
            # avoids taking the median of an empty list.
            _speed = 1 - np.median(np.array(penalty_tps))
            precision = float(true_pos) / float(true_pos + false_pos)
            recall = float(true_pos) / float(total_pos)
            f1_erde = 2 * (precision * recall) / (precision + recall)
            _latencyweightedF1 = f1_erde * _speed
        else:
            _latencyweightedF1 = 0
            _speed = 0

        # NOTE(review): the run predictions are passed to sklearn as y_true
        # and the gold labels as y_pred, which swaps macro precision and
        # recall; left unchanged to keep the reported numbers stable.
        # The comparison is positional, so it presumably relies on the qrels
        # file listing subjects in the same order as the nick-sorted run.
        y_true = self.run_results['pred'].tolist()
        y_pred_b = list(self.qrels_multiclass.values())

        # Binary metrics
        accuracy = metrics.accuracy_score(y_true, y_pred_b)
        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')

        median_latency = np.median(np.array(latency_tps))

        print("BINARY METRICS: =============================")
        print("Accuracy:"+str(accuracy))
        print("Macro precision:"+str(macro_precision))
        print("Macro recall:"+str(macro_recall))
        print("Macro f1:"+str(macro_f1))
        print("Micro precision:"+str(micro_precision))
        print("Micro recall:"+str(micro_recall))
        print("Micro f1:"+str(micro_f1))

        print("LATENCY-BASED METRICS: =============================")
        print("ERDE_5:"+str(np.mean(erdes5)))
        print("ERDE_50:"+str(np.mean(erdes50)))
        print("Median latency:"+str(median_latency))
        print("Speed:"+str(_speed))
        print("latency-weightedF1:"+str(_latencyweightedF1))

        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,
                'Macro_F1': macro_f1, 'Micro_P': micro_precision, 'Micro_R': micro_recall,
                'Micro_F1': micro_f1, 'ERDE5': np.mean(erdes5), 'ERDE30': np.mean(erdes30),
                'ERDE50': np.mean(erdes50), 'latencyTP': median_latency,
                'speed': _speed, 'latency-weightedF1': _latencyweightedF1}

    # Calculation of P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Compute and print P@10, P@20, P@30 and P@50.

        Rows are ordered by descending ``pred_b`` and a row counts as a hit
        when ``pred_b`` equals its binary gold label.  Returns a dict keyed by
        1, 50 and 75 whose values are
        ``{"@10": ..., "@20": ..., "@30": ..., "@50": ...}``.  The same run
        data is used for every key, so the three entries are equal.
        """
        print("===================================================")
        print("PRECISION AT K - EVALUATION:")
        ranks_at = [1, 50, 75]
        rank_dit = {}

        # Labels are attached positionally, so align on the nick-sorted order
        # (the same alignment eval_performance uses) on a private copy.  The
        # previous code re-attached the labels to a frame it had already
        # re-sorted by prediction, misaligning them from the second rank on.
        ranked = self.run_results.sort_values(by=['nick']).copy()
        ranked["label"] = list(self.qrels_b.values())
        ranked = ranked.sort_values(by=['pred_b'], ascending=False)
        hits = (ranked['pred_b'] == ranked['label']).tolist()[:50]

        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            # Each precision divides by k even when fewer than k rows exist.
            p10 = float(sum(hits[:10])) / 10.0
            p20 = float(sum(hits[:20])) / 20.0
            p30 = float(sum(hits[:30])) / 30.0
            p50 = float(sum(hits[:50])) / 50.0

            print("PRECISION AT K: =============================")
            print("P@10:"+str(p10))
            print("P@20:"+str(p20))
            print("P@30:"+str(p30))
            print("P@50:"+str(p50))
            rank_dit[rank] = {"@10": p10, "@20": p20, "@30": p30, "@50": p50}
        return rank_dit
|
448 |
+
|
449 |
+
|
450 |
+
#######################################################################################
|
451 |
+
# Calculation of Regression metrics for Multi-output regression tasks
|
452 |
+
class ClassMultiRegressionEvaluation():
    """Regression and rank-based metrics for a multi-output regression run.

    ``data`` is stored as ``run_results`` and ``qrels`` is the path of the
    gold-label CSV, loaded with ``read_qrels_multioutput`` into ``qrels``
    (subject -> list of four values).  Each ``pred`` entry is indexed at
    positions 0..3 to match those four outputs.

    NOTE(review): ``eval_performance`` treats ``run_results`` as a single
    DataFrame with ``nick`` and ``pred`` columns, while
    ``eval_performance_rank_based`` indexes it by round number
    (``run_results[rank]``) and expects a DataFrame per round; presumably the
    two methods are used on differently built instances - confirm with the
    caller.
    """

    def __init__(self, task, data, qrels):
        self.run_results = data
        self.qrels = read_qrels_multioutput(qrels)
        self.task = task

    def eval_performance(self):
        """Compute and print RMSE and the per-output Pearson coefficients.

        Sorts ``self.run_results`` by ``nick`` (stored back on the instance)
        and compares its ``pred`` column positionally with the gold values.
        Returns a dict with ``'RMSE:'``, ``'Pearson_mean'`` and one Pearson
        coefficient per output (``sf``, ``sa``, ``so``, ``c``).

        Raises:
            ValueError: if predictions and gold labels differ in shape.
        """
        self.run_results = self.run_results.sort_values(by=['nick'])
        y_true = self.run_results['pred'].tolist()
        y_pred_r = list(self.qrels.values())

        # Regression metrics.
        # RMSE is computed directly with numpy: the previous
        # ``mean_squared_error(..., squared=False)`` call relies on a keyword
        # that newer scikit-learn releases deprecate and remove.
        predicted = np.asarray(y_true, dtype=float)
        gold = np.asarray(y_pred_r, dtype=float)
        if predicted.shape != gold.shape:
            raise ValueError(
                "Found input variables with inconsistent numbers of samples: "
                + str([len(y_true), len(y_pred_r)])
            )
        # NOTE(review): as before, only the RMSE of the first output is
        # reported (index 0 of the per-output values) - confirm this is the
        # intended aggregate.
        _rmse = np.sqrt(np.mean(np.square(predicted - gold), axis=0))[0]
        _pearson_sf, _ = pearsonr(predicted[:, 0], gold[:, 0])
        _pearson_sa, _ = pearsonr(predicted[:, 1], gold[:, 1])
        _pearson_so, _ = pearsonr(predicted[:, 2], gold[:, 2])
        _pearson_c, _ = pearsonr(predicted[:, 3], gold[:, 3])

        print("REGRESSION METRICS: =============================")
        print("RMSE:"+str(_rmse))
        print("Pearson correlation coefficient:")
        print("Pearson sf:"+str(_pearson_sf))
        print("Pearson sa:"+str(_pearson_sa))
        print("Pearson so:"+str(_pearson_so))
        print("Pearson c:"+str(_pearson_c))
        pearson = (_pearson_sf + _pearson_sa + _pearson_so + _pearson_c) / 4
        # The 'RMSE:' key (with the colon) is kept as is for existing callers.
        return {'RMSE:': _rmse, 'Pearson_mean': pearson, 'Pearson_sf': _pearson_sf,
                'Pearson_sa': _pearson_sa, 'Pearson_so': _pearson_so, 'Pearson_c': _pearson_c}

    # Calculation of P@5, P@10, P@20, P@30, P@50
    def eval_performance_rank_based(self):
        """Compute and print P@5, P@10, P@20, P@30 and P@50 per round.

        For each round in 1, 25, 50 and 75 the frame ``self.run_results[round]``
        is sorted by ``nick``, gets the gold vectors attached positionally as
        ``label`` and is then ordered by descending ``pred``.  For each of the
        four outputs a row is a hit when the label equals the prediction
        rounded to one decimal; the reported precision is the mean over the
        four outputs.  The last processed frame is kept in
        ``self.run_results_``.

        Returns a dict keyed by round with the five averaged precision values.
        """
        print("===================================================")
        print("PRECISION AT - EVALUATION:")
        ranks_at = [1, 25, 50, 75]
        cutoffs = (5, 10, 20, 30, 50)
        rank_dit = {}
        for rank in ranks_at:
            print("Analizing ranking at round "+str(rank))
            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
            self.run_results_["label"] = list(self.qrels.values())
            self.run_results_ = self.run_results_.sort_values(by=['pred'], ascending=False)

            # Only the first 50 ranked rows can contribute to any cutoff.
            top_rows = [r for _, r in self.run_results_.head(50).iterrows()]
            sums = [0.0] * len(cutoffs)
            for j in range(0, 4):
                hits = [r['label'][j] == round(r["pred"][j], 1) for r in top_rows]
                # Hits are counted within the first k rows for every cutoff.
                # The previous counters were shifted by one slot, so P@5 was
                # (hits in the top 10) / 5 and could exceed 1.
                for slot, k in enumerate(cutoffs):
                    sums[slot] += float(sum(hits[:k])) / k
            p5, p10, p20, p30, p50 = sums

            print("PRECISION AT K: =============================")
            print("P@5:"+str(p5/4))
            print("P@10:"+str(p10/4))
            print("P@20:"+str(p20/4))
            print("P@30:"+str(p30/4))
            print("P@50:"+str(p50/4))
            rank_dit[rank] = {"@5": p5/4, "@10": p10/4, "@20": p20/4, "@30": p30/4, "@50": p50/4}
        return rank_dit
|
543 |
+
|
544 |
+
|
545 |
+
# Class for calculating carbon emission values
|
546 |
+
class Emissions():
    """Accumulate per-round emission values and summarise them.

    ``emissions_run`` maps a metric name to a list.  For the descriptive keys
    listed in ``_STATIC_KEYS`` only the first list element is reported; for
    every other key the list collects one value per round.
    """

    # Keys whose values are reported as-is instead of being aggregated.
    _STATIC_KEYS = ("cpu_count", "gpu_count", "cpu_model", "gpu_model", "ram_total_size")

    def __init__(self, emissions_run) -> None:
        self.emissions_run = emissions_run
        # Last cumulative reading seen for each key, starting at zero.
        self.aux = {key: 0 for key in emissions_run}

    # Update of values after a prediction has been made
    def update_emissions(self, emissions_round):
        """Record the values of one round.

        ``emissions_round`` holds cumulative readings, so the value appended
        for each non-static key is the difference with the previous reading.
        """
        for key in self.emissions_run:
            if key in self._STATIC_KEYS:
                continue
            reading = emissions_round[key]
            self.emissions_run[key].append(reading - self.aux[key])
            self.aux[key] = reading

    # Calculation of final values after all predictions have been made
    def calculate_emissions(self):
        """Return the summary dict.

        Static keys map to the first element of their list; every other key
        yields ``<key>_min``, ``<key>_max``, ``<key>_mean`` and ``<key>_var``.
        """
        summary = {}
        for key, series in self.emissions_run.items():
            if key in self._STATIC_KEYS:
                # Non-numerical values
                summary[key] = series[0]
                continue
            # Numerical values
            summary[key + "_min"] = min(series)
            summary[key + "_max"] = max(series)
            summary[key + "_mean"] = sum(series) / len(series)
            summary[key + "_var"] = np.var(series)
        return summary
|
src/data.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests, os, glob
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
train_dir = "./data/train"
|
6 |
+
test_dir = "./data/test"
|
7 |
+
truth_dir = "golden_truth"
|
8 |
+
|
9 |
+
def load(set_name:str='train', with_labels:bool=True) -> pd.DataFrame:
    """
    Load the train or test split as a dataframe of messages.

    If the local data directory for the split exists, the ``*.json`` files in
    it are read and, when ``with_labels`` is true, merged with the task-d gold
    file found under ``golden_truth``. Otherwise the split is fetched through
    ``get_train`` / ``get_test``.

    Whenever the resulting frame has ``d_*`` columns, the a/b/c task labels
    are derived from them and appended.

    Parameters
    ----------
    set_name : str
        Either ``'train'`` or ``'test'``.
    with_labels : bool
        Whether to merge the gold labels when reading local files.
        NOTE(review): it is not forwarded to the remote fetchers, which
        return the dataset as published.

    Raises
    ------
    ValueError
        If ``set_name`` is not ``'train'`` or ``'test'``.
    """
    if set_name == 'train':
        path = train_dir
    elif set_name == 'test':
        path = test_dir
    else:
        raise ValueError("set_name must be either 'train' or 'test'")

    if not os.path.exists(path):
        # The fetchers take ``hf_token``; the previous ``with_labels=`` keyword
        # was not part of their signature and raised TypeError.
        fetch = get_train if set_name == "train" else get_test
        df = fetch(hf_token=None)
    else:
        data_files = glob.glob(os.path.join(path, '*.json'))
        truth_df = None
        if with_labels:
            truth_path = os.path.join(path, truth_dir, 'task2_gold_d.txt')
            truth_df = pd.read_csv(truth_path).rename(
                columns=lambda s: 'd_' + s if s != 'Subject' else 'subject_id'
            )
        df = load_from_files(data_files, truth=truth_df)

    # Derive the a/b/c labels only when there is something to derive them
    # from (unlabelled loads have no d_ columns) and they are not present yet.
    d_columns = df.filter(regex='^d_')
    already_derived = {'a_label', 'b_label', 'c_label'} & set(df.columns)
    if d_columns.shape[1] > 0 and not already_derived:
        abc_labels_df = make_task_labels_from_d(d_columns.values.astype(float))
        df = pd.concat([df, abc_labels_df], axis=1)
    return df
|
37 |
+
|
38 |
+
def concat_messages(df:pd.DataFrame, sep:str=' | ') -> pd.DataFrame:
    """
    Collapse every subject's messages into a single row.

    Rows are ordered by subject and date; per subject the messages are joined
    with ``sep``, ``round`` keeps its last value and every other column keeps
    its first value. The result has one row per ``subject_id``.
    """
    carried = df.columns.drop(['subject_id', 'message', 'round'])
    how = {'message': lambda msgs: sep.join(msgs), 'round': 'last'}
    how.update({col: 'first' for col in carried})

    dated = df.assign(date=lambda frame: pd.to_datetime(frame['date']))
    ordered = dated.sort_values(['subject_id', 'date'], ascending=[True, True])
    per_subject = ordered.groupby('subject_id').agg(how).sort_index()
    return per_subject.reset_index()
|
55 |
+
|
56 |
+
def load_from_files(files, truth=None):
    """Load the messages stored in JSON files into a single dataframe.

    Each file is expected to contain a JSON list of message objects with the
    keys ``id_message``, ``date`` and ``message``; ``nick`` and ``round`` are
    optional and default to the file's base name (up to the first dot) and
    ``-1`` respectively.

    Parameters
    ----------
    files : iterable of str
        Paths of the JSON files to read.
    truth : pd.DataFrame, optional
        Frame with a ``subject_id`` column; when given it is merged onto the
        messages (pandas' default inner join on ``subject_id``).

    Returns
    -------
    pd.DataFrame
        Columns ``subject_id``, ``round``, ``id_message``, ``date``,
        ``message`` plus any columns brought in by ``truth``.
    """
    import json
    import os

    columns = ['subject_id', 'round', 'id_message', 'date', 'message']
    rows = []
    for path in files:
        fallback_id = os.path.basename(path).split('.')[0]
        # JSON is UTF-8 by specification; do not depend on the platform default
        # encoding, which mangles accented characters on some systems.
        with open(path, encoding='utf-8') as file:
            msgs = json.load(file)
        rows.extend(
            [
                msg.get('nick', fallback_id),
                msg.get('round', -1),
                msg['id_message'],
                msg['date'],
                msg['message'],
            ]
            for msg in msgs
        )
    df = pd.DataFrame(rows, columns=columns)
    if truth is not None:
        df = df.merge(truth, on='subject_id')
    return df
|
74 |
+
|
75 |
+
def get_train(hf_token:str=None, with_labels:bool=True):
    """Download the training split of the task-d dataset as a dataframe.

    Parameters
    ----------
    hf_token : str, optional
        Currently unused by this function; kept for interface compatibility.
    with_labels : bool, optional
        Accepted so that ``load`` can call this function with
        ``with_labels=...``; the dataset is returned as published either way.

    Returns
    -------
    pd.DataFrame
        The ``train`` split of ``nlpUc3mStudents/mental-risk-d``.
    """
    # Imported lazily so the rest of the module works without `datasets`.
    from datasets import load_dataset
    ds = load_dataset('nlpUc3mStudents/mental-risk-d')
    return ds['train'].to_pandas()
|
80 |
+
|
81 |
+
def get_test(hf_token:str=None, with_labels:bool=True):
    """Placeholder for fetching the test split, which is not available.

    Both parameters are optional so that every call style used in this module
    (``hf_token=...`` or ``with_labels=...``) reaches the explicit error below
    instead of failing with a TypeError.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("Test data is not available")
|
83 |
+
|
84 |
+
|
85 |
+
task_d_cols = ['suffer+in favour', 'suffer+against', 'suffer+other', 'control']

def make_task_labels_from_d(d_data:np.ndarray, include_d:bool=False) -> pd.DataFrame:
    """
    Get the labels of all other tasks from the labels of the d task.

    The last column of the task-d data is treated as the control score and
    the preceding columns as the "suffer" scores. From them:

    * ``c_label`` is the name of the highest-scoring suffer column when the
      suffer scores add up to at least 0.5, otherwise ``'control'``;
    * ``a_label`` is 1 when ``c_label`` is not ``'control'``, else 0;
    * ``b_label`` is the sum of the suffer scores rounded to 2 decimals.

    Parameters
    ----------
    d_data : np.ndarray or pd.DataFrame
        Task-d scores, one row per subject. Arrays get the ``task_d_cols``
        column names; dataframes keep their own names (for example with a
        ``d_`` prefix).
    include_d : bool
        If true the task-d columns are returned alongside the derived labels.
    """
    if isinstance(d_data, pd.DataFrame):
        d_df = d_data.astype(float)
    else:
        d_df = pd.DataFrame(d_data, columns=task_d_cols).astype(float)

    # Select the suffer columns by position for every label, so dataframes
    # whose columns are not named exactly like task_d_cols work too.
    suffer = d_df.iloc[:, :-1]
    # skipna=False: a row with a missing score never reaches the threshold.
    is_suffer = suffer.sum(axis=1, skipna=False) >= 0.5
    if suffer.shape[0]:
        top_position = suffer.to_numpy().argmax(axis=1)
    else:
        top_position = np.zeros(0, dtype=int)
    top_name = pd.Series(d_df.columns.to_numpy()[top_position], index=d_df.index, dtype=object)
    c_label = top_name.where(is_suffer, 'control')

    df = d_df.assign(
        c_label=c_label,
        a_label=(c_label != 'control').astype(int),
        b_label=suffer.sum(axis=1).round(2),
    )
    if not include_d:
        df = df[['a_label', 'b_label', 'c_label']]
    return df
|
src/embeddings.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, Any, Union
|
2 |
+
import numpy as np
|
3 |
+
from sklearn.base import BaseEstimator, RegressorMixin
|
4 |
+
from sklearn.multioutput import MultiOutputRegressor
|
5 |
+
from sentence_transformers import SentenceTransformer
|
6 |
+
from transformers import AutoTokenizer
|
7 |
+
import sklearn
|
8 |
+
from sklearn.pipeline import Pipeline
|
9 |
+
from sklearn.decomposition import PCA
|
10 |
+
from sklearn.preprocessing import StandardScaler
|
11 |
+
|
12 |
+
from copy import deepcopy
|
13 |
+
|
14 |
+
from . import utils
|
15 |
+
|
16 |
+
class EmbeddingsRegressor(BaseEstimator, RegressorMixin):
    """Regressor that encodes texts with ``encoder`` and feeds the encodings
    to ``regressor``.

    The encodings computed by the last ``transform`` or text ``predict`` call
    are kept in ``self.encodings``.
    """

    def __init__(
        self,
        encoder: Union[SentenceTransformer, AutoTokenizer],
        regressor: Union[MultiOutputRegressor, BaseEstimator],
        normalize_output: bool = True,
        verbose: bool = False
    ):
        self.encoder = encoder
        self.regressor = regressor
        self.normalize_output = normalize_output
        self.verbose = verbose
        self.encodings = None

    def _embed(self, texts):
        """Encode ``texts``; the progress bar follows ``self.verbose``."""
        return self.encoder.encode(texts, show_progress_bar=self.verbose)

    def fit(self, X: List[str], y: List[Tuple[float, float, float, float]]) -> "EmbeddingsRegressor":
        """Encode the texts and fit the wrapped regressor on the encodings."""
        self.regressor.fit(self._embed(X), y)
        return self

    def transform(self, X: List[str]) -> List[List[float]]:
        """Return the encodings of ``X`` and remember them."""
        self.encodings = self._embed(X)
        return self.encodings

    def predict(self, X: Union[List[str], np.array], encodings=False) -> Union[List[float],List[List[float]]]:
        """Predict from texts, or from ready-made encodings when
        ``encodings`` is true.

        With ``normalize_output`` each prediction row is divided in place by
        its sum.
        """
        features = X
        if not encodings:
            features = self._embed(X)
            self.encodings = features
        pred = self.regressor.predict(features)
        if self.normalize_output:
            pred /= pred.sum(axis=1, keepdims=True)
        return pred
|
49 |
+
|
src/eval.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List, Tuple, Any, Callable
|
2 |
+
from dataclasses import dataclass
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.metrics import (
|
6 |
+
f1_score, accuracy_score, recall_score, confusion_matrix,
|
7 |
+
classification_report,
|
8 |
+
r2_score, mean_squared_error
|
9 |
+
)
|
10 |
+
|
11 |
+
|
12 |
+
@dataclass
class ClassificationScores:
    """Precision / recall / F1 (and optional support) for one label or average."""
    precision: float
    recall: float
    f1: float
    support: float = None

    @classmethod
    def from_dict(cls, d:Dict[str, float]) -> "ClassificationScores":
        """Build the scores from a mapping such as one entry of a
        classification report.

        Keys are cut at the first ``-`` (so ``'f1-score'`` becomes ``'f1'``)
        and keys that are not fields of this class are ignored.
        """
        known = cls.__annotations__
        kwargs = {}
        for key, value in d.items():
            field_name = key.split('-')[0]
            if field_name in known:
                kwargs[field_name] = value
        return cls(**kwargs)
|
23 |
+
|
24 |
+
@dataclass
class RegressionScores:
    """R2, MSE and RMSE of one regression target.

    Instances can be added together and divided by a number, which allows
    averaging scores across folds or runs.
    """
    r2: float
    mse: float
    rmse: float

    @classmethod
    def make(cls, true:np.ndarray, pred:np.ndarray) -> "RegressionScores":
        """Compute the scores of ``pred`` against ``true``."""
        # mean_squared_error(squared=False) is deprecated and removed in
        # recent scikit-learn; take the root of the per-output MSE instead
        # (averaged over outputs when there are several).
        per_output_rmse = np.sqrt(mean_squared_error(true, pred, multioutput='raw_values'))
        return cls(
            r2=r2_score(true, pred),
            mse=mean_squared_error(true, pred),
            rmse=per_output_rmse.mean()
        )

    def __add__(self, other):
        """Field-wise sum of two score objects."""
        return RegressionScores(
            r2=self.r2 + other.r2,
            mse=self.mse + other.mse,
            rmse=self.rmse + other.rmse
        )

    def __truediv__(self, other):
        """Divide every field by the number ``other``."""
        return RegressionScores(
            r2=self.r2 / other,
            mse=self.mse / other,
            rmse=self.rmse / other
        )
|
50 |
+
)
|
51 |
+
|
52 |
+
|
53 |
+
@dataclass
class ClassificationReport:
    """Accuracy, confusion matrix and averaged / per-label scores of one
    classification task."""
    accuracy: float
    confusion: np.ndarray
    macro: "ClassificationScores"
    weighted: "ClassificationScores"
    labels: list
    label_scores: Dict[str, "ClassificationScores"] # label -> ClassificationScores

    f1: float = None # only for binary classification
    recall: float = None # only for binary classification

    @classmethod
    def make_report(cls, true:np.ndarray, pred:np.ndarray) -> "ClassificationReport":
        """Build the report for ``pred`` against ``true``.

        The label set is the union of the values seen in both arrays. ``f1``
        and ``recall`` are filled in only when exactly two labels occur.
        """
        class_labels = np.unique(np.concatenate([true, pred]))
        raw = classification_report(true, pred, labels=class_labels, output_dict=True, zero_division=0)
        # Take the aggregate entries out first; what is left is per-label.
        accuracy = raw.pop('accuracy')
        confusion = confusion_matrix(true, pred, labels=class_labels)
        macro = ClassificationScores.from_dict(raw.pop('macro avg'))
        weighted = ClassificationScores.from_dict(raw.pop('weighted avg'))
        per_label = {label: ClassificationScores.from_dict(scores) for label, scores in raw.items()}
        rep = cls(
            accuracy=accuracy,
            confusion=confusion,
            macro=macro,
            weighted=weighted,
            label_scores=per_label,
            labels=list(class_labels)
        )
        if len(class_labels) == 2:
            rep.f1 = f1_score(true, pred)
            rep.recall = recall_score(true, pred)
        return rep

    @property
    def df(self):
        """One-row dataframe with the accuracy and the macro-averaged scores
        (support excluded)."""
        row = {'Accuracy': self.accuracy}
        for score in self.macro.__annotations__:
            if score == 'support':
                continue
            row[f'{score.title()} (macro)'] = getattr(self.macro, score)
        return pd.DataFrame([row])
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
@dataclass
class RegressionReport:
    """Overall R2 / RMSE of a regression task, plus per-target scores when
    the task has several targets."""
    r2: float
    rmse: float
    labels: list = None # only for multivariate regression
    label_scores: Dict[str, float] = None # only for multivariate regression

    @classmethod
    def make_report(cls, true:np.ndarray, pred:np.ndarray, labels=None) -> "RegressionReport":
        """Build the report for ``pred`` against ``true``.

        Parameters
        ----------
        true, pred : np.ndarray
            Targets and predictions; 2-D arrays with more than one column
            additionally get per-column scores.
        labels : sequence, optional
            Names of the target columns; defaults to their positions.
        """
        # mean_squared_error(squared=False) is deprecated and removed in
        # recent scikit-learn; take the root of the per-output MSE and
        # average it over the outputs instead.
        per_output_rmse = np.sqrt(mean_squared_error(true, pred, multioutput='raw_values'))
        report = cls(
            r2=r2_score(true, pred),
            rmse=per_output_rmse.mean()
        )
        if len(true.shape) > 1 and true.shape[1] > 1:
            # Explicit emptiness test: `labels or ...` is ambiguous for arrays.
            if labels is not None and len(labels) > 0:
                report.labels = list(labels)
            else:
                report.labels = list(range(true.shape[1]))
            report.label_scores = {label: RegressionScores.make(true[:,i], pred[:,i]) for i,label in enumerate(report.labels)}
        return report

    @property
    def is_multivariate(self):
        """True when per-target labels were recorded."""
        return self.labels is not None

    @property
    def df(self):
        """One-row dataframe: ``RMSE avg`` first, then the remaining columns
        in sorted order, with the task prefix (``a_``..``d_``) removed from
        the per-target column names."""
        df_dict = {
            'R2 avg': self.r2,
            'RMSE avg': self.rmse,
        }
        if self.is_multivariate:
            df_dict.update({f'R2 {label}': scores.r2 for label, scores in self.label_scores.items()})
            df_dict.update({f'RMSE {label}': scores.rmse for label, scores in self.label_scores.items()})
        df = pd.DataFrame([df_dict])
        df = df.filter(items=['RMSE avg'] + sorted(df.columns.difference(['RMSE avg'])))
        # Raw string: `\s` in a normal string literal is an invalid escape.
        df.columns = df.columns.str.replace(r'\s(a|b|c|d)_', ' ', regex=True)
        return df
|
130 |
+
|
131 |
+
|
132 |
+
@dataclass
class Results:
    """Reports of the four tasks; a task that was not evaluated is ``None``.

    Every field defaults to ``None`` so that a result object can be built
    from a subset of the tasks (e.g. ``Results(taskd=report)``).
    """
    taska: "ClassificationReport" = None
    taskb: "RegressionReport" = None
    taskc: "ClassificationReport" = None
    taskd: "RegressionReport" = None
|
138 |
+
|
139 |
+
|
140 |
+
def absolute_results(true_df:pd.DataFrame, pred_df:pd.DataFrame, tasks='abcd'):
    """Evaluate predictions against the truth for each requested task.

    The columns of a task are those whose name starts with ``<task>_``.
    Tasks ``a`` and ``c`` are scored as classification on their first such
    column; the others as regression on all of them. A task with no matching
    column in either frame gets ``None``.

    Returns a ``Results`` built from the ``task<letter>`` reports.
    """
    reports = {}
    for task in tasks:
        key = 'task' + task
        prefix = f'^{task}_'
        true_cols = true_df.filter(regex=prefix).sort_index(axis=1)
        pred_cols = pred_df.filter(regex=prefix).sort_index(axis=1)
        if len(true_cols.columns) == 0 or len(pred_cols.columns) == 0:
            reports[key] = None
        elif task in ['a', 'c']:
            reports[key] = ClassificationReport.make_report(
                true=true_cols.iloc[:, 0].values,
                pred=pred_cols.iloc[:, 0].values
            )
        else:
            # Only task d reports its targets by name.
            target_names = true_cols.columns.tolist() if task == 'd' else None
            reports[key] = RegressionReport.make_report(
                true=true_cols.values,
                pred=pred_cols.values,
                labels=target_names
            )
    return Results(**reports)
|
160 |
+
|
161 |
+
|
162 |
+
|
163 |
+
def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
    """Return a function that fits and scores every ``(name, estimator)`` pair.

    The returned function fits each estimator on the training data, scores
    its test predictions with ``score_func(y_test, y_pred)``, prints the
    score and returns a ``{name: score}`` dict.
    """
    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
        results = {}
        for label, model in estimators:
            model.fit(X_train, y_train)
            value = score_func(y_test, model.predict(X_test))
            print(f"\"{label}\" estimator score: {value:.4f}")
            results[label] = value
        return results
    return fit_eval_estimators
|
174 |
+
|
175 |
+
|
176 |
+
def label_metrics(score_fun, y_true, y_pred):
    """Apply ``score_fun`` per target column and return the scores as a list.

    With more than one column the function is applied to each column pair.
    Otherwise it is applied once to the flattened arrays and the result is
    returned as a list (lists are returned as they are, arrays are converted,
    scalars are wrapped).
    """
    multi_output = len(y_true.shape) > 1 and y_true.shape[1] > 1
    if multi_output:
        return [score_fun(y_true[:, col], y_pred[:, col]) for col in range(y_true.shape[1])]

    score = score_fun(y_true.ravel(), y_pred.ravel())
    if isinstance(score, list):
        return score
    if isinstance(score, np.ndarray):
        return score.tolist()
    return [score]
|
189 |
+
|
190 |
+
def metrics_for_estimators(estimators, score_fun, X, y_true):
    """Score the predictions of every already-fitted ``(name, estimator)``
    pair on ``X`` and return ``{name: list of scores}``."""
    return {
        label: label_metrics(score_fun, y_true, model.predict(X))
        for label, model in estimators
    }
|
src/models.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .roberta_regressor import RobertaRegressor
|
2 |
+
from .embeddings import EmbeddingsRegressor
|
3 |
+
|
4 |
+
|
5 |
+
class EmbeddingsSimpleRegressor:
    """Placeholder for a regressor that has not been implemented yet.

    The class statement was left without a colon and body, which is a
    SyntaxError and made the whole module unimportable.
    TODO: implement, or remove this class.
    """
|
src/multiregression.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple, Dict, Any, Union
|
2 |
+
from copy import deepcopy
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
import sklearn
|
6 |
+
from sklearn.base import BaseEstimator, RegressorMixin
|
7 |
+
from sklearn.multioutput import MultiOutputRegressor
|
8 |
+
from sklearn.pipeline import Pipeline
|
9 |
+
from sklearn.decomposition import PCA
|
10 |
+
from sklearn.preprocessing import StandardScaler
|
11 |
+
|
12 |
+
from . import utils
|
13 |
+
|
14 |
+
class RegChainWithPCA(BaseEstimator, RegressorMixin):

    def __init__(
        self,
        base_regressor:sklearn.base.BaseEstimator,
        num_components:float=0.97,
        pca_exclude_first:bool=True,
        **fit_params):
        """
        This chain works like sklearn.multioutput.RegressorChain,
        but applies PCA to reduce the dimensionality of the input data of the chain.
        By default, the first target is excluded from the PCA transformation.
        That is, it is fitted with the original input data while the rest of the targets
        are fitted with the PCA-transformed input data.

        Parameters
        ----------

        base_regressor : sklearn.base.BaseEstimator
            The base regressor to be used in the chain.
        num_components : float, optional
            The number of components to keep in the PCA transformation.
            If float, it is the ratio of variance to be kept.
            If int, it is the number of components to keep.
            The default is 0.97.
        pca_exclude_first : bool, optional
            If True the first target is excluded from the PCA transformation.
            If False all targets including the first are fitted with the PCA-transformed input data.
            The default is True.
        **fit_params :
            Additional parameters to be passed to the fit method of the base regressor.
        """
        self.base_regressor = base_regressor
        self.num_components = num_components
        self.pca_exclude_first = pca_exclude_first
        self.estimators = None
        self.pipes = None  # unused; kept so existing attribute layouts stay the same
        self.pipe = None   # fitted scaler+PCA pipeline, set by fit_pipe
        self.fit_params = fit_params

    def fit_pipe(self, X, num_components=None):
        """Fit the scaler+PCA pipeline on ``X``, store it in ``self.pipe``
        and return it.

        ``num_components`` overrides ``self.num_components`` for this call.
        """
        if num_components is None:
            num_components = self.num_components
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            # Honour the override; it used to be ignored in favour of
            # self.num_components.
            ('pca', PCA(n_components=num_components)),
        ])
        pipe.fit(X)
        self.pipe = pipe
        return pipe

    def _uses_raw_input(self, idx):
        """Whether the estimator at chain position ``idx`` sees the original
        (non-PCA) features."""
        return idx == 0 and self.pca_exclude_first

    def fit(self, X, y, **fit_params):
        """Fit one copy of the base regressor per target column of ``y``.

        Each estimator after the first is trained on the PCA features plus
        the predictions of the estimators before it. Keyword arguments are
        merged over the ``fit_params`` given at construction and passed to
        every estimator's ``fit``.

        Returns
        -------
        self
        """
        fit_params_ = self.fit_params.copy()
        fit_params_.update(fit_params)

        pipe = self.fit_pipe(X)
        X_transformed = pipe.transform(X)
        num_components_pca = X_transformed.shape[1]
        num_targets = y.shape[1]
        # PCA features followed by one (initially zero) column per target
        # that is filled with the chain's predictions as they are produced.
        X_aug = np.hstack((X_transformed, np.zeros((X.shape[0], num_targets))))
        del X_transformed

        self.estimators = [deepcopy(self.base_regressor) for _ in range(num_targets)]
        for idx, estimator in enumerate(self.estimators):
            if self._uses_raw_input(idx):
                features = X
            else:
                features = X_aug[:, : (num_components_pca + idx)]
            estimator.fit(features, y[:, idx], **fit_params_)
            if idx < num_targets - 1:
                X_aug[:, num_components_pca + idx] = estimator.predict(features)
        # Return the estimator, as scikit-learn estimators are expected to.
        return self

    def predict(self, X):
        """Run the chain on ``X`` and return an array with one column per
        fitted estimator."""
        Y_pred_chain = np.zeros((X.shape[0], len(self.estimators)))
        X_transformed = self.pipe.transform(X)
        num_components_pca = X_transformed.shape[1]
        X_aug = np.hstack((X_transformed, Y_pred_chain))
        last = len(self.estimators) - 1
        for idx, estimator in enumerate(self.estimators):
            if self._uses_raw_input(idx):
                Y_pred_chain[:, idx] = estimator.predict(X)
            else:
                Y_pred_chain[:, idx] = estimator.predict(X_aug[:, : (num_components_pca + idx)])
            if idx < last:
                X_aug[:, num_components_pca + idx] = Y_pred_chain[:, idx]
        return Y_pred_chain

    def score(self, X, y):
        """Score the predictions for ``X`` with ``utils.comp_score``."""
        return utils.comp_score(y, self.predict(X))

    def get_params(self, deep=True):
        """Return the constructor arguments, including the extra fit
        parameters, as a dict (``deep`` is accepted but not used)."""
        return {
            'base_regressor': self.base_regressor,
            'num_components': self.num_components,
            'pca_exclude_first': self.pca_exclude_first,
            **self.fit_params
        }
|
src/roberta_regressor.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Defines a wrapper class of RobertaPreTrainedModel model to do regression on text data.
|
3 |
+
Based on: https://www.kaggle.com/code/sumantindurkhya/bert-for-regression
|
4 |
+
"""
|
5 |
+
|
6 |
+
from typing import Optional, Tuple, Union
|
7 |
+
from tqdm import tqdm, trange
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
import torch.utils.checkpoint
|
13 |
+
from torch import nn
|
14 |
+
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
15 |
+
|
16 |
+
from transformers import BertModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel
|
17 |
+
|
18 |
+
class RobertaRegressor(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a small regression head.

    The head maps the pooled encoder output to 128 units and then to
    ``num_outputs`` values.
    """

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        """
        Parameters
        ----------
        config :
            Model configuration passed to ``RobertaModel``.
        num_outputs : int
            Number of regression outputs.
        dropout : float
            Dropout probability used in the head.
        freeze_bert : bool
            If true the encoder parameters are excluded from training.
        """
        super().__init__(config)

        self.num_outputs = num_outputs

        self.roberta = RobertaModel(config)
        if freeze_bert:
            # freeze the roberta parameters
            for param in self.roberta.parameters():
                param.requires_grad = False
        self.classifier = nn.Linear(config.hidden_size, 128)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()
        self.regressor = nn.Linear(128, num_outputs)


    def forward(self, input_ids, attention_mask):
        """Return the regression outputs for a batch of token ids."""
        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        logits = base_out.pooler_output
        # Head: linear -> dropout -> ReLU -> tanh -> dropout -> linear.
        # The order is kept as is so that trained weights stay valid.
        out = self.classifier(logits)
        out = self.dropout(out)
        out = self.relu(out)
        out = self.tanh(out)
        out = self.dropout(out)
        out = self.regressor(out)
        return out

    def predict(self, text:str, tokenizer, device, numpy=True) -> Tuple[float, float, float, float]:
        """Predict the outputs for ``text``.

        Parameters
        ----------
        text : str
            Text to score.
        tokenizer :
            Tokenizer whose ``encode_plus`` returns ``input_ids`` and
            ``attention_mask`` tensors.
        device :
            Device the inputs are moved to before the forward pass.
        numpy : bool
            If true return a NumPy array on the CPU, else a detached tensor.

        NOTE(review): the module's train/eval mode is not changed here, so
        dropout is active unless the caller switched to ``eval()``.
        """
        encoded = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt')
        # Pick the tensors by key: unpacking `.values()` into two names breaks
        # as soon as the tokenizer returns an extra entry.
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        # Only a detached result is returned, so no autograd graph is needed.
        with torch.no_grad():
            output = self(input_ids, attention_mask).squeeze()
        # free up memory
        del input_ids, attention_mask
        out = output.detach()
        if numpy:
            return out.cpu().numpy()
        return out
|
59 |
+
|
60 |
+
|
61 |
+
class RobertaSeqMultiRegressor(RobertaPreTrainedModel):
    """
    RoBERTa encoder with one head per output, intended for multi-output
    regression on text where the outputs are correlated (for example the
    components of a probability distribution).

    NOTE(review): the stated design goal is a sequential head in which each
    output is conditioned on the previous ones. As written, every head is an
    independent ``Linear(hidden_size, 128)`` applied to the same pooled
    output, and ``forward`` returns a list of 128-dimensional tensors rather
    than one value per output. Confirm whether this class is finished.
    """

    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
        super().__init__(config)

        self.num_outputs = num_outputs

        self.roberta = RobertaModel(config)
        if freeze_bert:
            # Exclude the encoder from training.
            for weight in self.roberta.parameters():
                weight.requires_grad = False
        # One head per output, registered under the names regressor_0,
        # regressor_1, ...
        for head_idx in range(num_outputs):
            setattr(self, f"regressor_{head_idx}", nn.Linear(config.hidden_size, 128))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.tanh = nn.Tanh()

    def _apply_head(self, head_idx, pooled):
        """Run head ``head_idx`` on ``pooled``: linear -> dropout -> ReLU -> tanh."""
        head = getattr(self, f"regressor_{head_idx}")
        return self.tanh(self.relu(self.dropout(head(pooled))))

    def forward(self, input_ids, attention_mask):
        """Return a list with the output of every head for the batch."""
        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled = base_out.pooler_output
        outputs = []
        for head_idx in range(self.num_outputs):
            outputs.append(self._apply_head(head_idx, pooled))
        return outputs
|
108 |
+
|
109 |
+
|
110 |
+
def sum_diff_loss(output, target):
    """Sum of the absolute element-wise differences between ``output`` and
    ``target``."""
    return (output - target).abs().sum()
|
112 |
+
|
113 |
+
def evaluate(model, criterion, dataloader, device, sum_diff_penalty=False):
    """Return the mean per-batch loss of ``model`` over ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch is an ``(input_ids, attention_mask, target)`` triple.
    ``sum_diff_penalty`` is accepted but not used.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for input_ids, attention_mask, target in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            output = model(input_ids, attention_mask)
            loss = criterion(output.squeeze(), target.type_as(output))
            batch_losses.append(loss.item())

    return sum(batch_losses) / len(batch_losses)
|
127 |
+
|
128 |
+
# def predict(model, dataloader, device):
|
129 |
+
# predicted_label = []
|
130 |
+
# actual_label = []
|
131 |
+
# with torch.no_grad():
|
132 |
+
# for input_ids, attention_mask, target in (dataloader):
|
133 |
+
|
134 |
+
# input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
|
135 |
+
# output = model(input_ids, attention_mask)
|
136 |
+
|
137 |
+
# predicted_label += output
|
138 |
+
# actual_label += target
|
139 |
+
|
140 |
+
# return predicted_label
|
141 |
+
|
142 |
+
def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
    """Train ``model`` for ``epochs`` epochs.

    After every epoch the mean training loss and the validation loss (from
    ``evaluate``) are printed. Batches are
    ``(input_ids, attention_mask, target)`` triples.
    """
    for epoch in trange(epochs, desc="Epoch"):
        model.train()
        running_loss = 0
        for input_ids, attention_mask, target in train_loader:
            optimizer.zero_grad()

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            target = target.to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(output.squeeze(), target.type_as(output))
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        mean_train_loss = running_loss / len(train_loader)
        print(f"Training loss is {mean_train_loss}")
        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
        print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))
|
163 |
+
|
164 |
+
def multi_reg_loss(loss='mse', sum_diff_penalty:float=0.0):
    """
    Build a loss for multi-output regression that adds, to a base loss, a
    penalty on the squared difference between the sum of the predictions and
    the sum of the targets.

    The penalty steers the model towards outputs whose total matches the
    total of the targets, e.g. in task d the four labels form a probability
    distribution and add up to 1.

    Parameters
    ----------
    loss : str, optional
        Base loss: 'mse' (mean squared error, the default) or
        'cross_entropy'.
    sum_diff_penalty : float, optional
        Weight of the penalty term, by default 0.0 (no penalty).

    Raises
    ------
    ValueError
        If ``loss`` is not one of the available options.
    """
    for name, candidate in (('mse', F.mse_loss), ('cross_entropy', F.cross_entropy)):
        if loss == name:
            base_loss = candidate
            break
    else:
        raise ValueError("Invalid loss function. Available options: 'mse' and 'cross_entropy'")

    def reg_loss(input, target):
        # Base loss first, then the penalty on the difference of the totals.
        base = base_loss(input, target)
        totals_gap = torch.sum(input) - torch.sum(target)
        return base + sum_diff_penalty*torch.square(totals_gap)

    return reg_loss
|
196 |
+
|
src/train.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
# Embeddings
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
|
6 |
+
# train a classifier on the embeddings for multiclass regression
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.metrics import (
|
9 |
+
r2_score, mean_squared_error, # regression metrics
|
10 |
+
accuracy_score, f1_score, precision_score, recall_score # classification metrics
|
11 |
+
)
|
12 |
+
from sklearn.multioutput import MultiOutputRegressor, RegressorChain # for multiclass regression
|
13 |
+
|
14 |
+
# Estimators
|
15 |
+
from sklearn.ensemble import (
|
16 |
+
RandomForestRegressor,
|
17 |
+
RandomForestClassifier,
|
18 |
+
GradientBoostingRegressor,
|
19 |
+
GradientBoostingClassifier,
|
20 |
+
AdaBoostRegressor,
|
21 |
+
AdaBoostClassifier
|
22 |
+
)
|
23 |
+
from sklearn.linear_model import (
|
24 |
+
LinearRegression,
|
25 |
+
LogisticRegression,
|
26 |
+
Ridge,
|
27 |
+
Lasso
|
28 |
+
)
|
29 |
+
# other regressors
|
30 |
+
from sklearn.svm import SVR
|
31 |
+
from sklearn.neighbors import KNeighborsRegressor
|
32 |
+
from sklearn.neural_network import MLPRegressor
|
33 |
+
from sklearn.tree import DecisionTreeRegressor
|
34 |
+
|
35 |
+
from lightgbm import LGBMRegressor, LGBMClassifier
|
36 |
+
|
37 |
+
# type hinting
|
38 |
+
import os, json
|
39 |
+
from typing import List, Callable, Dict, Tuple, Any
|
40 |
+
|
41 |
+
# local imports
|
42 |
+
from src import data, utils
|
43 |
+
from src.embeddings import EmbeddingsRegressor
|
44 |
+
|
45 |
+
def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
    """
    Metric for multiclass regression: the RMSE is computed separately for
    every label (column) and the mean of those values is returned.
    """
    per_label_rmse = [
        np.sqrt(mean_squared_error(y_true[:, col], y_pred[:, col]))
        for col in range(y_true.shape[1])
    ]
    return np.mean(per_label_rmse)
|
53 |
+
|
54 |
+
|
55 |
+
def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
    """Return a function that fits and scores every ``(name, estimator)`` pair.

    The returned function fits each estimator on the training data, scores
    its test predictions with ``score_func(y_test, y_pred)``, prints the
    score and returns a ``{name: score}`` dict.
    """
    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
        results = {}
        for label, model in estimators:
            model.fit(X_train, y_train)
            value = score_func(y_test, model.predict(X_test))
            print(f"\"{label}\" estimator score: {value:.4f}")
            results[label] = value
        return results
    return fit_eval_estimators
|
66 |
+
|
67 |
+
|
68 |
+
def get_data():
    """
    Load the train/test splits and derive a stratified validation split.

    Steps:
      1. Load both splits and concatenate the messages of every subject.
      2. Hold out 15% of the training subject ids for validation, stratified
         on a class derived from the ``d_*`` label columns.
      3. Augment the remaining training rows with a copy of each row that
         keeps only the first half of its ' | '-separated messages.

    Returns:
        Tuple ``(train_df, val_df, test_df)``.
    """
    # load the raw splits, then concatenate the messages by subject id
    train_data = data.load('train')
    test_df = data.load('test')
    train_data = data.concat_messages(train_data)
    test_df = data.concat_messages(test_df)

    # position -> column name lookup for the label columns
    label_names = dict(enumerate(train_data.filter(regex='^d_').columns))

    def _dominant_label(row):
        # When all-but-the-last label columns sum to less than 0.5 the argmax
        # runs over every column; otherwise the last column is excluded.
        leading = row[:-1]
        if leading.sum() < 0.5:
            return row.argmax()
        return leading.argmax()

    label_frame = train_data.set_index('subject_id').filter(regex='^d_')
    subj_classes = label_frame.apply(_dominant_label, axis=1).replace(label_names)

    # 85/15 split of the subject ids, stratified on the derived class
    tr_subj_ids, val_subj_ids = train_test_split(
        subj_classes.index,
        test_size=0.15,
        random_state=42,
        stratify=subj_classes.values,
    )
    val_df = train_data[train_data['subject_id'].isin(val_subj_ids)]
    train_df = train_data[train_data['subject_id'].isin(tr_subj_ids)]

    def _first_half(text):
        # NOTE(review): a text with a single ' | '-separated part yields an
        # empty string here -- confirm such rows are acceptable for training.
        parts = text.split(' | ')
        return ' | '.join(parts[:len(parts) // 2])

    # augmentation: same rows, message truncated to its first half
    half_messages_df_train = train_df.assign(
        message=lambda df: df['message'].apply(_first_half),
    )
    combined = pd.concat([train_df, half_messages_df_train], axis=0)
    train_df = combined.sort_values('subject_id').reset_index(drop=True)
    return train_df, val_df, test_df
|
src/utils.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Tuple
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
def print_messages(msgs:List[dict]):
    """
    Print every message of a subject, one per line, as "<date> - <message>".

    Messages are a list of dictionaries of the form:
    [{'id_message': {int_id}, 'message': '{str_message}', 'date': '{str_date}'}, ...]
    and are attached to a specific subject.
    """
    for entry in msgs:
        stamp = entry['date']
        text = entry['message']
        print(f"{stamp} - {text}")
|
14 |
+
|
15 |
+
def load_data(files, truth):
    """
    Load all the subject message files into a single dataframe.

    Args:
        files: iterable of paths to JSON files. Each file holds a list of
            message dicts with the keys 'id_message', 'date' and 'message'.
            The subject id is the file's base name up to its first dot.
        truth: dataframe with a 'subject_id' column; it is merged onto the
            messages (default inner join, so subjects that are missing from
            ``truth`` are dropped).

    Returns:
        Dataframe with one row per message and the columns 'subject_id',
        'id_message', 'date', 'message' followed by the columns of ``truth``.
    """
    import os, json
    rows = []
    for path in files:
        # the subject id is the same for every message of a file
        subject_id = os.path.basename(path).split('.')[0]
        # JSON is UTF-8; do not rely on the platform default codec, which
        # corrupts or rejects accented text on some systems
        with open(path, encoding='utf-8') as file:
            msgs = json.load(file)
        rows.extend(
            [subject_id, msg['id_message'], msg['date'], msg['message']]
            for msg in msgs
        )
    df = pd.DataFrame(rows, columns=['subject_id', 'id_message', 'date', 'message'])
    return df.merge(truth, on='subject_id')
|
27 |
+
|
28 |
+
|
29 |
+
def normalize(x, prob=True, n_labels=4):
    """
    Min-max normalize each row to [0,1] and, if prob=True, rescale it to sum 1.

    Args:
        x: array-like whose size is a multiple of ``n_labels``; it is reshaped
            to ``(-1, n_labels)`` and every row is normalized independently.
        prob: when True, each scaled row is divided by its sum.
        n_labels: number of values per row (defaults to the original 4).

    Returns:
        Array of shape ``(-1, n_labels)`` rounded to 4 decimals.

    A row whose values are all equal has no range to scale by. Instead of
    returning NaN (0/0) it maps to all zeros, or to the uniform distribution
    ``1/n_labels`` when ``prob=True``.
    """
    x = np.asarray(x, dtype=float).reshape(-1, n_labels)
    lowest = x.min(axis=1, keepdims=True)
    span = x.max(axis=1, keepdims=True) - lowest
    # divide constant rows by 1 so they scale to zeros rather than 0/0
    safe_span = np.where(span == 0, 1.0, span)
    # normalize to [0,1]
    x = ((x - lowest) / safe_span).round(4)
    if prob:
        totals = x.sum(axis=1, keepdims=True)
        # only constant rows sum to zero after scaling (otherwise the max is 1)
        degenerate = totals == 0
        safe_totals = np.where(degenerate, 1.0, totals)
        # normalize to sum 1
        x = np.where(degenerate, 1.0 / n_labels, x / safe_totals)
    return x.round(4)
|
40 |
+
|
41 |
+
def label_metrics(score_fun, y_true, y_pred):
    """
    Apply ``score_fun`` to every label column separately.

    Columns are enumerated from ``y_true.shape[1]``; for each column index the
    metric is called as ``score_fun(y_true[:, i], y_pred[:, i])``.

    Returns:
        List with one score per column, in column order.
    """
    return [
        score_fun(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
|
46 |
+
|
47 |
+
def make_predict(predict_fn, **kwargs):
    """
    Wrap ``predict_fn`` so the given keyword arguments are applied on every call.

    Returns:
        A one-argument function ``predict(msg)`` that returns
        ``predict_fn(msg, **kwargs)``.
    """
    def predict(msg):
        return predict_fn(msg, **kwargs)

    return predict
|
52 |
+
|
53 |
+
|
54 |
+
def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
    """
    Metric for simple and multiclass regression. Computes the average of the RMSE scores for each label.

    Args:
        y_true: ground-truth targets, shape ``(n_samples,)`` for a single
            target or ``(n_samples, n_labels)`` for several.
        y_pred: predictions with exactly the same shape as ``y_true``.

    Returns:
        Mean over the labels of the per-label RMSE.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` differ in shape.

    NaN values in the inputs propagate to the result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # fail loudly instead of broadcasting or silently ignoring extra columns
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.ndim == 1:
        # simple regression: treat the single target as one label column
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)
    # RMSE of every label column at once, then the average over the labels
    rmse_scores = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(np.mean(rmse_scores))
|