simonsv committed on
Commit
1eba40c
1 Parent(s): dc7dab2

Made a simple, functional Streamlit app to host the model

app.py ADDED
@@ -0,0 +1,38 @@
+ import pickle
+
+ import numpy as np
+ import pandas as pd
+ import streamlit as st
+ from sentence_transformers import SentenceTransformer
+
+ from src import data, utils
+ from src.embeddings import EmbeddingsRegressor
+
+
+ # load the fitted regression model
+ with open('models/2d_ridge_roberta-suicide-regchain-pca-final.pkl', 'rb') as f:
+     regressor = pickle.load(f)
+
+ # sentence encoder whose embeddings feed the regressor
+ model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
+ tokenizer = SentenceTransformer(model_name)
+ model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=True)
+ predict = utils.make_predict(model.predict)
+
+ # model_selector = st.sidebar.selectbox(
+ #     'Select model:',
+ #     ['roberta', 'roberta_seq_multi', 'roberta_seq_multi_2']
+ # )
+
+ text_input = st.text_input('Enter your text here:')
+ if text_input:
+     prediction = predict([text_input]).tolist()
+     prediction = np.array(prediction).reshape(-1, 4)
+     prediction = utils.normalize(prediction)
+     preds_df = data.make_task_labels_from_d(prediction, include_d=True).rename(
+         columns={c: 'd_' + c.replace('+', '_').replace('|', '_') for c in data.task_d_cols}
+     )
+     preds_df['b_label'] = np.clip(preds_df['b_label'], 0, 1)
+     # show the predictions as a table
+     table = st.table(preds_df)
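To try the new app locally, installing the pinned requirements and launching Streamlit should be enough, assuming the pickled model under models/ is in place: `pip install -r requirements.txt`, then `streamlit run app.py`.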
models/2d_ridge_roberta-suicide-regchain-pca-final.pkl ADDED
Binary file (154 kB).
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ sentence-transformers
+ pandas
+ streamlit
+ scikit-learn>=1.2.1
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (172 Bytes).
 
src/__pycache__/class_eval.cpython-38.pyc ADDED
Binary file (15.7 kB).
 
src/__pycache__/data.cpython-38.pyc ADDED
Binary file (4.34 kB).
 
src/__pycache__/embeddings.cpython-38.pyc ADDED
Binary file (2.15 kB).
 
src/__pycache__/eval.cpython-38.pyc ADDED
Binary file (7.51 kB).
 
src/__pycache__/multiregression.cpython-38.pyc ADDED
Binary file (4.34 kB).
 
src/__pycache__/roberta_regressor.cpython-38.pyc ADDED
Binary file (6.73 kB).
 
src/__pycache__/utils.cpython-38.pyc ADDED
Binary file (2.52 kB).
 
src/berta_finetuning.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ from datasets import Dataset, load_dataset  # , Features, Value, ClassLabel
+
+ ds = load_dataset('nlpUc3mStudents/mental-risk-c')
+ # to pandas
+ train_df = ds['train'].to_pandas()
+ test_df = ds['test'].to_pandas()
+ label_names = train_df.iloc[:, 4:].columns.tolist()
+ # concat messages by subject id
+ train_by_subjectid = (
+     train_df.groupby('subject_id')
+     .agg({'message': lambda x: ' | '.join(x), **{col: 'first' for col in label_names}})
+     .reset_index()
+     # .assign(
+     #     num_messages=lambda x: x.message.str.count('\|') + 1
+     # )
+ )
+ # back to datasets
+ train_ds = Dataset.from_pandas(train_by_subjectid)
+
+ model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ # this model is trained with 2 labels, yet we need 4, so the classification head must be replaced
+ model = None  # TODO: load the model with a 4-output head (see the sketch below)
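One way to realize the comment above and obtain a 4-output head from this 2-label checkpoint is to re-initialize the classification head when loading the model. This is only a sketch of a possible setup, reusing `model_name` and the import from the snippet above:

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=4,                  # head sized for the four task-d outputs
        problem_type="regression",     # use an MSE loss with the new head
        ignore_mismatched_sizes=True,  # discard the incompatible 2-label head weights
    )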
src/class_eval.py ADDED
@@ -0,0 +1,576 @@
1
+ #This file has been developed by the SINAI research group for its usage in the MentalRiskES evaluation campaign at IberLEF 2023.
2
+
3
+ # Required libraries
4
+ import pandas as pd
5
+ import numpy as np
6
+ import sklearn.metrics as metrics
7
+ from scipy.stats import pearsonr
8
+
9
+ # Read Gold labels for BinaryClassification
10
+ def read_qrels(qrels_file):
11
+ qrels={}
12
+ df_golden_truth = pd.read_csv(qrels_file)
13
+ for index, r in df_golden_truth.iterrows():
14
+ qrels[ r['Subject'] ] = int(r['label'])
15
+ print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
16
+ return(qrels)
17
+
18
+ # Read Gold labels for Simple Regression
19
+ def read_qrels_regression(qrels_file):
20
+ qrels={}
21
+ df_golden_truth = pd.read_csv(qrels_file)
22
+ for index, r in df_golden_truth.iterrows():
23
+ qrels[ r['Subject'] ] = float(r['label'])
24
+ print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
25
+ return(qrels)
26
+
27
+ # Read Gold labels for Multiclass classification
28
+ def read_qrels_multiclass(qrels_file):
29
+ qrels={}
30
+ qrels1 = {}
31
+ df_golden_truth = pd.read_csv(qrels_file)
32
+ for index, r in df_golden_truth.iterrows():
33
+ qrels1[ r['Subject'] ] = r['label']
34
+ if "suffer" in r['label']:
35
+ qrels[ r['Subject'] ] = 1
36
+ else:
37
+ qrels[ r['Subject'] ] = 0
38
+ print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
39
+ return qrels, qrels1
40
+
41
+ # Read Gold labels for Multi-output regression
42
+ def read_qrels_multioutput(qrels_file):
43
+ qrels={}
44
+ df_golden_truth = pd.read_csv(qrels_file)
45
+ for index, r in df_golden_truth.iterrows():
46
+ qrels[ r['Subject'] ] = [r['suffer_in_favour'],r['suffer_against'],r['suffer_other'],r['control']]
47
+ print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
48
+ return qrels
49
+
50
+ ###########################################################################
51
+ # Calculation of Binary classification metrics for Binary classification tasks
52
+ class BinaryClassification():
53
+ def __init__(self, task, data, qrels):
54
+ self.run_results = data
55
+ self.qrels_b = read_qrels(qrels)
56
+ self.task = task
57
+ pass
58
+
59
+ def penalty(self,delay):
60
+ if self.task == "1": # TCA
61
+ p = 0.0292 # trial
62
+ elif self.task == "2": # Depression
63
+ p = 0.0179 # trial
64
+ pen = -1.0 + 2.0/(1+np.exp(-p*(delay-1)))
65
+ return(pen)
66
+
67
+ def n_pos(self):
68
+ total_pos = 0
69
+ for key in self.qrels_b:
70
+ total_pos += self.qrels_b[key]
71
+ return(total_pos)
72
+
73
+ def eval_performance(self):
74
+ print("===================================================")
75
+ print("DECISION-BASED EVALUATION:")
76
+ self.run_results = self.run_results.sort_values(by=['nick'])
77
+ total_pos=self.n_pos()
78
+ erdes5 = np.zeros(len(self.run_results))
79
+ erdes30 = np.zeros(len(self.run_results))
80
+ erdes50 = np.zeros(len(self.run_results))
81
+ ierdes = 0
82
+ true_pos = 0
83
+ false_pos = 0
84
+ latency_tps = list()
85
+ penalty_tps = list()
86
+
87
+ # Latency-based metrics
88
+ for index, r in self.run_results.iterrows():
89
+ try:
90
+
91
+ if ( self.qrels_b[ r['nick'] ] == r['pred'] ):
92
+ if ( r['pred'] == 1 ):
93
+ true_pos+=1
94
+ erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
95
+ erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))
96
+ erdes50[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 50.0)))
97
+ latency_tps.append(r["round"]+1)
98
+ penalty_tps.append(self.penalty(r["round"]+1))
99
+ else:
100
+ erdes5[ierdes]=0
101
+ erdes30[ierdes]=0
102
+ erdes50[ierdes]=0
103
+ else:
104
+ if ( r['pred'] == 1 ):
105
+ false_pos+=1
106
+ erdes5[ierdes]=float(total_pos)/float(len(self.qrels_b))
107
+ erdes50[ierdes]=float(total_pos)/float(len(self.qrels_b))
108
+ else:
109
+ erdes5[ierdes]=1
110
+ erdes30[ierdes]=1
111
+ erdes50[ierdes]=1
112
+ except KeyError:
113
+ print("User does not appear in the qrels:"+r['nick'])
114
+ ierdes+=1
115
+
116
+ _speed = 1-np.median(np.array(penalty_tps))
117
+ if true_pos != 0 :
118
+ precision = float(true_pos) / float(true_pos+false_pos)
119
+ recall = float(true_pos) / float(total_pos)
120
+ f1_erde = 2 * (precision * recall) / (precision + recall)
121
+ _latencyweightedF1 = f1_erde*_speed
122
+ else:
123
+ _latencyweightedF1 = 0
124
+ _speed = 0
125
+
126
+ y_true = self.run_results['pred'].tolist()
127
+ y_pred_b = list(self.qrels_b.values())
128
+
129
+ # Binary metrics
130
+ accuracy = metrics.accuracy_score(y_true, y_pred_b)
131
+ macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
132
+ macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
133
+ macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
134
+ micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
135
+ micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
136
+ micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')
137
+
138
+ print("BINARY METRICS: =============================")
139
+ print("Accuracy:"+str(accuracy))
140
+ print("Macro precision:"+str(macro_precision))
141
+ print("Macro recall:"+str(macro_recall))
142
+ print("Macro f1:"+str(macro_f1))
143
+ print("Micro precision:"+str(micro_precision))
144
+ print("Micro recall:"+str(micro_recall))
145
+ print("Micro f1:"+str(micro_f1))
146
+
147
+ print("LATENCY-BASED METRICS: =============================")
148
+ print("ERDE_5:"+str(np.mean(erdes5)))
149
+ print("ERDE_50:"+str(np.mean(erdes50)))
150
+ print("Median latency:"+str(np.median(np.array(latency_tps))))
151
+ print("Speed:"+str(_speed))
152
+ print("latency-weightedF1:"+str(_latencyweightedF1))
153
+
154
+ return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
155
+ 'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30': np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': np.median(np.array(latency_tps)),
156
+ 'speed': _speed, 'latency-weightedF1': _latencyweightedF1}
157
+
158
+ # Calculation of P@10, P@20, P@30, P@50
159
+ def eval_performance_rank_based(self):
160
+ print("===================================================")
161
+ print("RANK-BASED EVALUATION:")
162
+ ranks_at=[1,50,75]
163
+ rank_dit = {}
164
+ for rank in ranks_at:
165
+ print("Analyzing ranking at round "+str(rank))
166
+ rels_topk = [0,0,0,0]
167
+ self.run_results["label"] = self.qrels_b.values()
168
+ self.run_results = self.run_results.sort_values(by=['pred'],ascending=False)
169
+ i = 0
170
+ for index, r in self.run_results.iterrows():
171
+ if i<10:
172
+ if r["pred"] == r['label']:
173
+ rels_topk[0] += 1
174
+ rels_topk[1] += 1
175
+ rels_topk[2] += 1
176
+ rels_topk[3] += 1
177
+ elif i<20:
178
+ if r["pred"] == r['label']:
179
+ rels_topk[1] += 1
180
+ rels_topk[2] += 1
181
+ rels_topk[3] += 1
182
+ elif i<30:
183
+ if r["pred"] == r['label']:
184
+ rels_topk[2] += 1
185
+ rels_topk[3] += 1
186
+ elif i<50:
187
+ if r["pred"] == r['label']:
188
+ rels_topk[3] += 1
189
+ else:
190
+ break
191
+ i+=1
192
+ p10 = float(rels_topk[0])/10.0
193
+ p20 = float(rels_topk[1])/20.0
194
+ p30 = float(rels_topk[2])/30.0
195
+ p50 = float(rels_topk[3])/50.0
196
+
197
+ print("PRECISION AT K: =============================")
198
+ print("P@10:"+str(p10))
199
+ print("P@20:"+str(p20))
200
+ print("P@30:"+str(p30))
201
+ print("P@50:"+str(p50))
202
+ rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
203
+ return rank_dit
204
+
205
+
206
+ #############################################################################################
207
+ # Calculation of Regression metrics for Simple regression tasks
208
+ class ClassRegressionEvaluation():
209
+ def __init__(self, task, data, qrels):
210
+ self.run_results = data
211
+ self.qrels = read_qrels_regression(qrels)
212
+ self.task = task
213
+
214
+ def eval_performance(self):
215
+ self.run_results = self.run_results.sort_values(by=['nick'])
216
+ y_true = self.run_results['pred'].tolist()
217
+
218
+ y_pred_r = list(self.qrels.values())
219
+
220
+ # Regression metrics
221
+ _rmse = metrics.mean_squared_error(y_true, y_pred_r, sample_weight=None, multioutput='raw_values', squared=False)[0]
222
+ _pearson = np.corrcoef(y_true, y_pred_r)
223
+ _pearson, _ = pearsonr(y_true, y_pred_r)
224
+
225
+ print("REGRESSION METRICS: =============================")
226
+ print("RMSE:"+str(_rmse))
227
+ print("Pearson correlation coefficient:"+str(_pearson))
228
+
229
+ return { 'RMSE:': _rmse, 'Pearson_coefficient': _pearson}
230
+
231
+ # Calculation of P@10, P@20, P@30, P@50
232
+ def eval_performance_rank_based(self):
233
+ print("===================================================")
234
+ print("RANK-BASED EVALUATION:")
235
+ ranks_at=[1,25,50,75]
236
+ rank_dit = {}
237
+ for rank in ranks_at:
238
+ print("Analyzing ranking at round "+str(rank))
239
+ rels_topk = [0,0,0,0,0]
240
+ self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
241
+ self.run_results_["label"] = self.qrels.values()
242
+ self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)
243
+ i = 0
244
+ for index, r in self.run_results_.iterrows():
245
+ if i<5:
246
+ if r["label"] == round(r["pred"],1):
247
+ rels_topk[0] += 1
248
+ rels_topk[1] += 1
249
+ rels_topk[2] += 1
250
+ rels_topk[3] += 1
251
+ rels_topk[4] += 1
252
+ elif i<10:
253
+ if r['label'] == round(r["pred"],1):
254
+ rels_topk[1] += 1
255
+ rels_topk[2] += 1
256
+ rels_topk[3] += 1
257
+ rels_topk[4] += 1
258
+ elif i<20:
259
+ if r['label'] == round(r["pred"],1):
260
+ rels_topk[2] += 1
261
+ rels_topk[3] += 1
262
+ rels_topk[4] += 1
263
+ elif i<30:
264
+ if r['label'] == round(r["pred"],1):
265
+ rels_topk[3] += 1
266
+ rels_topk[4] += 1
267
+ elif i<50:
268
+ if r['label'] == round(r["pred"],1):
269
+ rels_topk[4] += 1
270
+ else:
271
+ break
272
+ i+=1
273
+ p5 = float(rels_topk[0])/5.0
274
+ p10 = float(rels_topk[1])/10.0
275
+ p20 = float(rels_topk[2])/20.0
276
+ p30 = float(rels_topk[3])/30.0
277
+ p50 = float(rels_topk[4])/50.0
278
+
279
+ print("PRECISION AT K: =============================")
280
+ print("P@5:"+str(p5))
281
+ print("P@10:"+str(p10))
282
+ print("P@20:"+str(p20))
283
+ print("P@30:"+str(p30))
284
+ print("P@50:"+str(p50))
285
+ rank_dit[rank] = {"@5":p5,"@10":p10,"@20":p20,"@30":p30,"@50":p50}
286
+ return rank_dit
287
+
288
+
289
+ ############################################################################
290
+ # Calculation of Binary metrics for Multiclass classification tasks
291
+ class BinaryMultiClassification():
292
+ def __init__(self, task, data, qrels):
293
+ self.run_results = data
294
+ self.qrels_b, self.qrels_multiclass = read_qrels_multiclass(qrels)
295
+ self.task = task
296
+ pass
297
+
298
+ def penalty(self,delay):
299
+ if self.task == "1": # TCA
300
+ p = 0.0411 # test
301
+ p = 0.0292 # trial
302
+ elif self.task == "2": # Depression
303
+ p = 0.0326 # test
304
+ p = 0.0179 # trial
305
+ else: # Unknown
306
+ p = 0.0308 # test
307
+ pen = -1.0 + 2.0/(1+np.exp(-p*(delay-1)))
308
+ return(pen)
309
+
310
+ def n_pos(self):
311
+ total_pos = 0
312
+ for key in self.qrels_b:
313
+ total_pos += self.qrels_b[key]
314
+ return(total_pos)
315
+
316
+
317
+ def eval_performance(self):
318
+ print("===================================================")
319
+ print("DECISION-BASED EVALUATION:")
320
+ self.run_results = self.run_results.sort_values(by=['nick'])
321
+ total_pos=self.n_pos() # Total number of positive documents
322
+ erdes5 = np.zeros(len(self.run_results))
323
+ erdes30 = np.zeros(len(self.run_results))
324
+ erdes50 = np.zeros(len(self.run_results))
325
+ ierdes = 0
326
+ true_pos = 0
327
+ false_pos = 0
328
+ latency_tps = list()
329
+ penalty_tps = list()
330
+
331
+ for index, r in self.run_results.iterrows():
332
+ try:
333
+
334
+ if ( self.qrels_b[ r['nick'] ] == r['pred_b'] ):
335
+ if ( r['pred_b'] == 1 ):
336
+ true_pos+=1
337
+ erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
338
+ erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))
339
+ erdes50[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 50.0)))
340
+ latency_tps.append(r["round"]+1)
341
+ penalty_tps.append(self.penalty(r["round"]+1))
342
+ else:
343
+ erdes5[ierdes]=0
344
+ erdes30[ierdes]=0
345
+ erdes50[ierdes]=0
346
+ else:
347
+ if ( r['pred_b'] == 1 ):
348
+ false_pos+=1
349
+ erdes5[ierdes]=float(total_pos)/float(len(self.qrels_b))
350
+ erdes30[ierdes]=float(total_pos)/float(len(self.qrels_b))
351
+ erdes50[ierdes]=float(total_pos)/float(len(self.qrels_b))
352
+ else:
353
+ erdes5[ierdes]=1
354
+ erdes30[ierdes]=1
355
+ erdes50[ierdes]=1
356
+ except KeyError:
357
+ print("User does not appear in the qrels:"+r['nick'])
358
+ ierdes+=1
359
+
360
+ _speed = 1-np.median(np.array(penalty_tps))
361
+ if true_pos != 0 :
362
+ precision = float(true_pos) / float(true_pos+false_pos)
363
+ recall = float(true_pos) / float(total_pos)
364
+ f1_erde = 2 * (precision * recall) / (precision + recall)
365
+ _latencyweightedF1 = f1_erde*_speed
366
+ else:
367
+ _latencyweightedF1 = 0
368
+ _speed = 0
369
+
370
+ y_true = self.run_results['pred'].tolist()
371
+ y_pred_b = list(self.qrels_multiclass.values())
372
+
373
+ # Binary metrics
374
+ accuracy = metrics.accuracy_score(y_true, y_pred_b)
375
+ macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
376
+ macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
377
+ macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
378
+ micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
379
+ micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
380
+ micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')
381
+
382
+ print("BINARY METRICS: =============================")
383
+ print("Accuracy:"+str(accuracy))
384
+ print("Macro precision:"+str(macro_precision))
385
+ print("Macro recall:"+str(macro_recall))
386
+ print("Macro f1:"+str(macro_f1))
387
+ print("Micro precision:"+str(micro_precision))
388
+ print("Micro recall:"+str(micro_recall))
389
+ print("Micro f1:"+str(micro_f1))
390
+
391
+ print("LATENCY-BASED METRICS: =============================")
392
+ print("ERDE_5:"+str(np.mean(erdes5)))
393
+ print("ERDE_50:"+str(np.mean(erdes50)))
394
+ print("Median latency:"+str(np.median(np.array(latency_tps))))
395
+ print("Speed:"+str(_speed))
396
+ print("latency-weightedF1:"+str(_latencyweightedF1))
397
+
398
+ return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
399
+ 'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30':np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': np.median(np.array(latency_tps)),
400
+ 'speed': _speed, 'latency-weightedF1': _latencyweightedF1}
401
+
402
+ # Calculation of P@10, P@20, P@30, P@50
403
+ def eval_performance_rank_based(self):
404
+ print("===================================================")
405
+ print("PRECISION AT K - EVALUATION:")
406
+ ranks_at=[1,50,75]
407
+ rank_dit = {}
408
+ for rank in ranks_at:
409
+ print("Analyzing ranking at round "+str(rank))
410
+ rels_topk = [0,0,0,0]
411
+ self.run_results["label"] = self.qrels_b.values()
412
+ self.run_results = self.run_results.sort_values(by=['pred_b'],ascending=False)
413
+ i = 0
414
+ for index, r in self.run_results.iterrows():
415
+ if i<10:
416
+ if r["pred_b"] == r['label']:
417
+ rels_topk[0] += 1
418
+ rels_topk[1] += 1
419
+ rels_topk[2] += 1
420
+ rels_topk[3] += 1
421
+ elif i<20:
422
+ if r["pred_b"] == r['label']:
423
+ rels_topk[1] += 1
424
+ rels_topk[2] += 1
425
+ rels_topk[3] += 1
426
+ elif i<30:
427
+ if r["pred_b"] == r['label']:
428
+ rels_topk[2] += 1
429
+ rels_topk[3] += 1
430
+ elif i<50:
431
+ if r["pred_b"] == r['label']:
432
+ rels_topk[3] += 1
433
+ else:
434
+ break
435
+ i+=1
436
+ p10 = float(rels_topk[0])/10.0
437
+ p20 = float(rels_topk[1])/20.0
438
+ p30 = float(rels_topk[2])/30.0
439
+ p50 = float(rels_topk[3])/50.0
440
+
441
+ print("PRECISION AT K: =============================")
442
+ print("P@10:"+str(p10))
443
+ print("P@20:"+str(p20))
444
+ print("P@30:"+str(p30))
445
+ print("P@50:"+str(p50))
446
+ rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
447
+ return rank_dit
448
+
449
+
450
+ #######################################################################################
451
+ # Calculation of Regression metrics for Multi-output regression tasks
452
+ class ClassMultiRegressionEvaluation():
453
+
454
+ def __init__(self, task, data, qrels):
455
+ self.run_results = data
456
+ self.qrels = read_qrels_multioutput(qrels)
457
+ self.task = task
458
+
459
+ def eval_performance(self):
460
+ self.run_results = self.run_results.sort_values(by=['nick'])
461
+ y_true = self.run_results['pred'].tolist()
462
+ y_pred_r = list(self.qrels.values())
463
+
464
+ # Regression metrics
465
+ _rmse = metrics.mean_squared_error(y_true, y_pred_r, sample_weight=None, multioutput='raw_values', squared=False)[0]
466
+ _pearson_sf, _ = pearsonr([item[0] for item in y_true] , [item[0] for item in y_pred_r])
467
+ _pearson_sa, _ = pearsonr([item[1] for item in y_true] , [item[1] for item in y_pred_r])
468
+ _pearson_so, _ = pearsonr([item[2] for item in y_true] , [item[2] for item in y_pred_r])
469
+ _pearson_c, _ = pearsonr([item[3] for item in y_true] , [item[3] for item in y_pred_r])
470
+
471
+ print("REGRESSION METRICS: =============================")
472
+ print("RMSE:"+str(_rmse))
473
+ print("Pearson correlation coefficient:")
474
+ print("Pearson sf:"+str(_pearson_sf))
475
+ print("Pearson sa:"+str(_pearson_sa))
476
+ print("Pearson so:"+str(_pearson_so))
477
+ print("Pearson c:"+str(_pearson_c))
478
+ pearson = (_pearson_sf + _pearson_sa + _pearson_so + _pearson_c)/4
479
+ return { 'RMSE:': _rmse, 'Pearson_mean': pearson,'Pearson_sf': _pearson_sf, 'Pearson_sa': _pearson_sa,'Pearson_so': _pearson_so,'Pearson_c': _pearson_c}
480
+
481
+ # Calculation of P@10, P@20, P@30, P@50
482
+ def eval_performance_rank_based(self):
483
+ print("===================================================")
484
+ print("PRECISION AT K - EVALUATION:")
485
+ ranks_at=[1,25,50,75]
486
+ rank_dit = {}
487
+ for rank in ranks_at:
488
+ print("Analyzing ranking at round "+str(rank))
489
+ self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
490
+ self.run_results_["label"] = self.qrels.values()
491
+ self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)
492
+ p5 = 0
493
+ p10 = 0
494
+ p20 = 0
495
+ p30 = 0
496
+ p50 = 0
497
+ for j in range(0,4):
498
+ rels_topk = [0,0,0,0,0]
499
+ i = 0
500
+ for index, r in self.run_results_.iterrows():
501
+ if i<5:
502
+ if r['label'][j] == round(r["pred"][j],1):
503
+ rels_topk[0] += 1
504
+ rels_topk[1] += 1
505
+ rels_topk[2] += 1
506
+ rels_topk[3] += 1
507
+ rels_topk[4] += 1
508
+ elif i<10:
509
+ if r['label'][j] == round(r["pred"][j],1):
510
+ rels_topk[0] += 1
511
+ rels_topk[1] += 1
512
+ rels_topk[2] += 1
513
+ rels_topk[3] += 1
514
+ elif i<20:
515
+ if r['label'][j] == round(r["pred"][j],1):
516
+ rels_topk[1] += 1
517
+ rels_topk[2] += 1
518
+ rels_topk[3] += 1
519
+ elif i<30:
520
+ if r['label'][j] == round(r["pred"][j],1):
521
+ rels_topk[2] += 1
522
+ rels_topk[3] += 1
523
+ elif i<50:
524
+ if r['label'][j] == round(r["pred"][j],1):
525
+ rels_topk[3] += 1
526
+ else:
527
+ break
528
+ i+=1
529
+ p5 += float(rels_topk[0])/5.0
530
+ p10 += float(rels_topk[0])/10.0
531
+ p20 += float(rels_topk[1])/20.0
532
+ p30 += float(rels_topk[2])/30.0
533
+ p50 += float(rels_topk[3])/50.0
534
+
535
+ print("PRECISION AT K: =============================")
536
+ print("P@5:"+str(p5/4))
537
+ print("P@10:"+str(p10/4))
538
+ print("P@20:"+str(p20/4))
539
+ print("P@30:"+str(p30/4))
540
+ print("P@50:"+str(p50/4))
541
+ rank_dit[rank] = {"@5":p5/4,"@10":p10/4,"@20":p20/4,"@30":p30/4,"@50":p50/4}
542
+ return rank_dit
543
+
544
+
545
+ # Class for calculating carbon emission values
546
+ class Emissions():
547
+ def __init__(self, emissions_run) -> None:
548
+ self.emissions_run = emissions_run
549
+ self.aux = {}
550
+ for key, value in emissions_run.items():
551
+ self.aux[key] = 0
552
+ pass
553
+
554
+ # Update of values after a prediction has been made
555
+ def update_emissions(self,emissions_round):
556
+ # The values are accumulated in each round, so the difference is calculated to know the values for that round only
557
+ for key, value in self.emissions_run.items():
558
+ if key not in ["cpu_count","gpu_count","cpu_model","gpu_model", "ram_total_size"]:
559
+ round_ = emissions_round[key] - self.aux[key]
560
+ self.emissions_run[key].append(round_)
561
+ self.aux[key] = emissions_round[key]
562
+
563
+ # Calculation of final values after all predictions have been made
564
+ def calculate_emissions(self):
565
+ dict_ = {}
566
+ for key, value in self.emissions_run.items():
567
+ # Non-numerical values
568
+ if key in ["cpu_count","gpu_count","cpu_model","gpu_model", "ram_total_size"]:
569
+ dict_[key] = self.emissions_run[key][0]
570
+ # Numerical values
571
+ else:
572
+ dict_[key+"_min"] = min(self.emissions_run[key])
573
+ dict_[key+"_max"] = max(self.emissions_run[key])
574
+ dict_[key+"_mean"] = sum(self.emissions_run[key])/len(self.emissions_run[key])
575
+ dict_[key+"_var"] = np.var(self.emissions_run[key])
576
+ return dict_
src/data.py ADDED
@@ -0,0 +1,104 @@
+ import os, glob
+ import pandas as pd
+ import numpy as np
+
+ train_dir = "./data/train"
+ test_dir = "./data/test"
+ truth_dir = "golden_truth"
+
+ def load(set_name: str = 'train', with_labels: bool = True) -> pd.DataFrame:
+     """
+     Load the data from the local JSON message files, or from the Hugging Face
+     dataset when no local copy exists
+     """
+     if set_name == 'train':
+         path = train_dir
+     elif set_name == 'test':
+         path = test_dir
+     else:
+         raise ValueError("set_name must be either 'train' or 'test'")
+     if not os.path.exists(path):
+         if set_name == "train":
+             df = get_train(with_labels=with_labels)
+         else:
+             df = get_test(with_labels=with_labels)
+     else:
+         data_files = glob.glob(os.path.join(path, '*.json'))
+         if with_labels:
+             truth_path = os.path.join(path, truth_dir, 'task2_gold_d.txt')
+             truth_df = pd.read_csv(truth_path).rename(
+                 columns=lambda s: 'd_' + s if s != 'Subject' else 'subject_id'
+             )
+         else:
+             truth_df = None
+         df = load_from_files(data_files, truth=truth_df)
+         abc_labels_df = make_task_labels_from_d(df.filter(regex='^d_').values.astype(float))
+         df = pd.concat([df, abc_labels_df], axis=1)
+     return df
+
+ def concat_messages(df: pd.DataFrame, sep: str = ' | ') -> pd.DataFrame:
+     """
+     Concatenate all the messages of a subject into a single message
+     """
+     df = (
+         df
+         .assign(date=lambda x: pd.to_datetime(x['date']))
+         .sort_values(['subject_id', 'date'], ascending=[True, True])
+         .groupby('subject_id')
+         .agg({
+             'message': lambda x: sep.join(x),
+             'round': 'last',
+             **{c: 'first' for c in df.columns.drop(['subject_id', 'message', 'round'])}
+         }).sort_index()
+         .reset_index()
+     )
+     return df
+
+ def load_from_files(files, truth=None):
+     """load all the data into a dataframe"""
+     import os, json
+     data = []
+     for f in files:
+         with open(f) as file:
+             msgs = json.load(file)
+             for msg in msgs:
+                 data.append([
+                     msg.get('nick', os.path.basename(f).split('.')[0]),
+                     msg.get('round', -1),
+                     msg['id_message'],
+                     msg['date'],
+                     msg['message']])
+     df = pd.DataFrame(data, columns=['subject_id', 'round', 'id_message', 'date', 'message'])
+     if truth is not None:
+         df = df.merge(truth, on='subject_id')
+     return df
+
+ def get_train(with_labels: bool = True):
+     # the HF dataset already includes the labels; `with_labels` is accepted for API symmetry with `load`
+     from datasets import load_dataset, Dataset
+     ds = load_dataset('nlpUc3mStudents/mental-risk-d')
+     train_df = ds['train'].to_pandas()
+     return train_df
+
+ def get_test(with_labels: bool = True):
+     raise NotImplementedError("Test data is not available")
+
+
+ task_d_cols = ['suffer+in favour', 'suffer+against', 'suffer+other', 'control']
+
+ def make_task_labels_from_d(d_data: np.ndarray, include_d: bool = False) -> pd.DataFrame:
+     """
+     Get the labels of all other tasks from the labels of the d task
+     """
+     if isinstance(d_data, pd.DataFrame):
+         d_df = d_data.astype(float)
+     else:
+         d_df = pd.DataFrame(d_data, columns=task_d_cols).astype(float)
+     df = d_df.assign(
+         c_label=lambda df: df.iloc[:, :-1].apply(
+             lambda x: df.columns[np.argmax(x)] if sum(x) >= 0.5 else 'control', axis=1
+         ),
+         a_label=lambda df: (df.c_label != 'control').astype(int),
+         b_label=lambda df: df[task_d_cols[:-1]].sum(axis=1).round(2)
+     )
+     if not include_d:
+         df = df[['a_label', 'b_label', 'c_label']]
+     return df
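A small illustration of how `make_task_labels_from_d` derives the other tasks' labels from a task-d probability vector (the numbers are made up):

    import numpy as np
    from src import data

    d_probs = np.array([[0.6, 0.0, 0.0, 0.4]])   # one subject: mostly 'suffer+in favour'
    labels = data.make_task_labels_from_d(d_probs, include_d=True)
    # yields a_label = 1, b_label = 0.6, c_label = 'suffer+in favour'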
src/embeddings.py ADDED
@@ -0,0 +1,49 @@
+ from typing import List, Tuple, Dict, Any, Union
+ import numpy as np
+ from sklearn.base import BaseEstimator, RegressorMixin
+ from sklearn.multioutput import MultiOutputRegressor
+ from sentence_transformers import SentenceTransformer
+ from transformers import AutoTokenizer
+ import sklearn
+ from sklearn.pipeline import Pipeline
+ from sklearn.decomposition import PCA
+ from sklearn.preprocessing import StandardScaler
+
+ from copy import deepcopy
+
+ from . import utils
+
+ class EmbeddingsRegressor(BaseEstimator, RegressorMixin):
+
+     def __init__(
+         self,
+         encoder: Union[SentenceTransformer, AutoTokenizer],
+         regressor: Union[MultiOutputRegressor, BaseEstimator],
+         normalize_output: bool = True,
+         verbose: bool = False
+     ):
+         self.encoder = encoder
+         self.regressor = regressor
+         self.normalize_output = normalize_output
+         self.encodings = None
+         self.verbose = verbose
+
+     def fit(self, X: List[str], y: List[Tuple[float, float, float, float]]) -> "EmbeddingsRegressor":
+         X = self.encoder.encode(X, show_progress_bar=self.verbose)
+         self.regressor.fit(X, y)
+         return self
+
+     def transform(self, X: List[str]) -> List[List[float]]:
+         X = self.encoder.encode(X, show_progress_bar=self.verbose)
+         self.encodings = X
+         return X
+
+     def predict(self, X: Union[List[str], np.ndarray], encodings=False) -> Union[List[float], List[List[float]]]:
+         # if `encodings` is True, X is assumed to already contain embeddings
+         if not encodings:
+             X = self.encoder.encode(X, show_progress_bar=self.verbose)
+             self.encodings = X
+         pred = self.regressor.predict(X)
+         if self.normalize_output:
+             pred /= pred.sum(axis=1, keepdims=True)
+         return pred
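A brief usage sketch for the wrapper above; the encoder checkpoint is the one used in app.py, while the regressor and the toy texts/targets are placeholders:

    from sentence_transformers import SentenceTransformer
    from sklearn.linear_model import Ridge
    from sklearn.multioutput import MultiOutputRegressor
    from src.embeddings import EmbeddingsRegressor

    encoder = SentenceTransformer('hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es')
    reg = EmbeddingsRegressor(encoder, MultiOutputRegressor(Ridge()), normalize_output=True)
    reg.fit(['texto de ejemplo', 'otro mensaje'], [[0.1, 0.1, 0.1, 0.7], [0.0, 0.2, 0.3, 0.5]])
    pred = reg.predict(['un mensaje nuevo'])   # shape (1, 4); rows rescaled to sum to 1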
src/eval.py ADDED
@@ -0,0 +1,195 @@
1
+ from typing import Dict, List, Tuple, Any, Callable
2
+ from dataclasses import dataclass
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.metrics import (
6
+ f1_score, accuracy_score, recall_score, confusion_matrix,
7
+ classification_report,
8
+ r2_score, mean_squared_error
9
+ )
10
+
11
+
12
+ @dataclass
13
+ class ClassificationScores:
14
+ precision: float
15
+ recall: float
16
+ f1: float
17
+ support: float = None
18
+
19
+ @classmethod
20
+ def from_dict(cls, d:Dict[str, float]) -> "ClassificationScores":
21
+ d = {k.split('-')[0]: v for k, v in d.items() if k.split('-')[0] in cls.__annotations__}
22
+ return cls(**d)
23
+
24
+ @dataclass
25
+ class RegressionScores:
26
+ r2: float
27
+ mse: float
28
+ rmse: float
29
+
30
+ @classmethod
31
+ def make(cls, true:np.ndarray, pred:np.ndarray) -> "RegressionScores":
32
+ return cls(
33
+ r2=r2_score(true, pred),
34
+ mse=mean_squared_error(true, pred),
35
+ rmse=mean_squared_error(true, pred, squared=False)
36
+ )
37
+
38
+ def __add__(self, other):
39
+ return RegressionScores(
40
+ r2=self.r2 + other.r2,
41
+ mse=self.mse + other.mse,
42
+ rmse=self.rmse + other.rmse
43
+ )
44
+
45
+ def __truediv__(self, other):
46
+ return RegressionScores(
47
+ r2=self.r2 / other,
48
+ mse=self.mse / other,
49
+ rmse=self.rmse / other
50
+ )
51
+
52
+
53
+ @dataclass
54
+ class ClassificationReport:
55
+ accuracy: float
56
+ confusion: np.ndarray
57
+ macro: ClassificationScores
58
+ weighted: ClassificationScores
59
+ labels: list
60
+ label_scores: Dict[str, ClassificationScores] # label -> ClassificationScores
61
+
62
+ f1: float = None # only for binary classification
63
+ recall: float = None # only for binary classification
64
+
65
+ @classmethod
66
+ def make_report(cls, true:np.ndarray, pred:np.ndarray) -> "ClassificationReport":
67
+ class_labels = np.unique(np.concatenate([true, pred]))
68
+ report = classification_report(true, pred, labels=class_labels, output_dict=True, zero_division=0)
69
+ rep = cls(
70
+ accuracy=report.pop('accuracy'),
71
+ confusion=confusion_matrix(true, pred, labels=class_labels),
72
+ macro=ClassificationScores.from_dict(report.pop('macro avg')),
73
+ weighted=ClassificationScores.from_dict(report.pop('weighted avg')),
74
+ label_scores={label: ClassificationScores.from_dict(scores) for label, scores in report.items()},
75
+ labels=list(class_labels)
76
+ )
77
+ if len(class_labels) == 2:
78
+ rep.f1 = f1_score(true, pred)
79
+ rep.recall = recall_score(true, pred)
80
+ return rep
81
+
82
+ @property
83
+ def df(self):
84
+ df_dict = {
85
+ 'Accuracy': self.accuracy,
86
+ **{f'{score.title()} (macro)': getattr(self.macro, score) for score in self.macro.__annotations__ if score != 'support'},
87
+ }
88
+ df = pd.DataFrame([df_dict])
89
+ return df
90
+
91
+
92
+
93
+
94
+ @dataclass
95
+ class RegressionReport:
96
+ r2: float
97
+ rmse: float
98
+ labels: list = None # only for multivariate regression
99
+ label_scores: Dict[str, float] = None # only for multivariate regression
100
+
101
+ @classmethod
102
+ def make_report(cls, true:np.ndarray, pred:np.ndarray, labels=None) -> "RegressionReport":
103
+ report = cls(
104
+ r2=r2_score(true, pred),
105
+ rmse=mean_squared_error(true, pred, squared=False)
106
+ )
107
+ if len(true.shape) > 1 and true.shape[1] > 1:
108
+ report.labels = labels or list(range(true.shape[1]))
109
+ report.label_scores = {label: RegressionScores.make(true[:,i], pred[:,i]) for i,label in enumerate(report.labels)}
110
+ return report
111
+
112
+ @property
113
+ def is_multivariate(self):
114
+ return self.labels is not None
115
+
116
+ @property
117
+ def df(self):
118
+ df_dict = {
119
+ 'R2 avg': self.r2,
120
+ 'RMSE avg': self.rmse,
121
+ }
122
+ if self.is_multivariate:
123
+ df_dict.update({f'R2 {label}': scores.r2 for label, scores in self.label_scores.items()})
124
+ df_dict.update({f'RMSE {label}': scores.rmse for label, scores in self.label_scores.items()})
125
+ df = pd.DataFrame([df_dict])
126
+ rmse_cols = ['RMSE avg']
127
+ df = df.filter(items=['RMSE avg', 'Pearson avg'] + sorted(df.columns.difference(['Pearson avg', 'RMSE avg'])))
128
+ df.columns = df.columns.str.replace('\s(a|b|c|d)_', ' ', regex=True)
129
+ return df
130
+
131
+
132
+ @dataclass
133
+ class Results:
134
+ taska: ClassificationReport
135
+ taskb: RegressionReport
136
+ taskc: ClassificationReport
137
+ taskd: RegressionReport
138
+
139
+
140
+ def absolute_results(true_df:pd.DataFrame, pred_df:pd.DataFrame, tasks='abcd'):
141
+ task_reports = {}
142
+ for task in tasks:
143
+ true=true_df.filter(regex=f'^{task}_').sort_index(axis=1)
144
+ pred=pred_df.filter(regex=f'^{task}_').sort_index(axis=1)
145
+ if len(true.columns) == 0 or len(pred.columns) == 0:
146
+ task_reports['task'+task] = None
147
+ continue
148
+ if task in ['a', 'c']:
149
+ task_reports['task'+task] = ClassificationReport.make_report(
150
+ true=true.iloc[:,0].values,
151
+ pred=pred.iloc[:,0].values
152
+ )
153
+ else:
154
+ task_reports['task'+task] = RegressionReport.make_report(
155
+ true=true.values,
156
+ pred=pred.values,
157
+ labels=true.columns.tolist() if task == 'd' else None
158
+ )
159
+ return Results(**task_reports)
160
+
161
+
162
+
163
+ def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
164
+ def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
165
+ estimator_scores = {}
166
+ for name, estimator in estimators:
167
+ estimator.fit(X_train, y_train)
168
+ y_pred = estimator.predict(X_test)
169
+ score = score_func(y_test, y_pred)#*(1.4*((y_train>th).sum()/(len(y_train)-1))) # weighted for class imbalance
170
+ print(f"\"{name}\" estimator score: {score:.4f}")
171
+ estimator_scores[name] = score
172
+ return estimator_scores
173
+ return fit_eval_estimators
174
+
175
+
176
+ def label_metrics(score_fun, y_true, y_pred):
177
+ if len(y_true.shape) > 1 and y_true.shape[1] > 1:
178
+ scores = []
179
+ for i in range(y_true.shape[1]):
180
+ scores.append(score_fun(y_true[:,i],y_pred[:,i]))
181
+ return scores
182
+ score = score_fun(y_true.ravel(), y_pred.ravel())
183
+ if isinstance(score, list):
184
+ return score
185
+ elif isinstance(score, np.ndarray):
186
+ return score.tolist()
187
+ else:
188
+ return [score]
189
+
190
+ def metrics_for_estimators(estimators, score_fun, X, y_true):
191
+ metrics = {}
192
+ for name, estimator in estimators:
193
+ y_pred = estimator.predict(X)
194
+ metrics[name] = label_metrics(score_fun, y_true, y_pred)
195
+ return metrics
src/models.py ADDED
@@ -0,0 +1,5 @@
+ from .roberta_regressor import RobertaRegressor
+ from .embeddings import EmbeddingsRegressor
+
+
+ class EmbeddingsSimpleRegressor:
+     # placeholder: the class body has not been written yet
+     pass
src/multiregression.py ADDED
@@ -0,0 +1,108 @@
1
+ from typing import List, Tuple, Dict, Any, Union
2
+ from copy import deepcopy
3
+
4
+ import numpy as np
5
+ import sklearn
6
+ from sklearn.base import BaseEstimator, RegressorMixin
7
+ from sklearn.multioutput import MultiOutputRegressor
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.decomposition import PCA
10
+ from sklearn.preprocessing import StandardScaler
11
+
12
+ from . import utils
13
+
14
+ class RegChainWithPCA(BaseEstimator, RegressorMixin):
15
+
16
+ def __init__(
17
+ self,
18
+ base_regressor:sklearn.base.BaseEstimator,
19
+ num_components:float=0.97,
20
+ pca_exclude_first:bool=True,
21
+ **fit_params):
22
+ """
23
+ This chain works like sklearn.multioutput.RegressorChain,
24
+ but applies PCA to reduce the dimensionality of the input data of the chain.
25
+ By default, the first target is excluded from the PCA transformation.
26
+ That is, it is fitted with the original input data while the rest of the targets
27
+ are fitted with the PCA-transformed input data.
28
+
29
+ Parameters
30
+ ----------
31
+
32
+ base_regressor : sklearn.base.BaseEstimator
33
+ The base regressor to be used in the chain.
34
+ num_components : float, optional
35
+ The number of components to keep in the PCA transformation.
36
+ If float, it is the ratio of variance to be kept.
37
+ If int, it is the number of components to keep.
38
+ The default is 0.97.
39
+ pca_exclude_first : bool, optional
40
+ If True the first target is excluded from the PCA transformation.
41
+ If False all targets including the first are fitted with the PCA-transformed input data.
42
+ The default is True.
43
+ **fit_params :
44
+ Additional parameters to be passed to the fit method of the base regressor.
45
+ """
46
+ self.base_regressor = base_regressor
47
+ self.num_components = num_components
48
+ self.pca_exclude_first = pca_exclude_first
49
+ self.estimators = None
50
+ self.pipes = None
51
+ self.fit_params = fit_params
52
+
53
+ def fit_pipe(self, X, num_components=None):
54
+ if num_components is None:
55
+ num_components = self.num_components
56
+ pipe = Pipeline([
57
+ ('scaler', StandardScaler()),
58
+ ('pca', PCA(n_components=self.num_components)),
59
+ ])
60
+ pipe.fit(X)
61
+ self.pipe = pipe
62
+ return pipe
63
+
64
+ def fit(self, X, y, **fit_params):
65
+ fit_params_ = self.fit_params.copy()
66
+ fit_params_.update(fit_params)
67
+ pipe = self.fit_pipe(X)
68
+ Y_pred_chain = np.zeros((X.shape[0], y.shape[1]))
69
+ X_transformed = pipe.transform(X)
70
+ num_components_pca = X_transformed.shape[1]
71
+ X_aug = np.hstack((X_transformed, Y_pred_chain))
72
+ self.estimators = [deepcopy(self.base_regressor) for _ in range(y.shape[1])]
73
+ del Y_pred_chain, X_transformed
74
+ for idx, estimator in enumerate(self.estimators):
75
+ if idx == 0 and self.pca_exclude_first:
76
+ estimator.fit(X, y[:, idx], **fit_params_)
77
+ else:
78
+ estimator.fit(X_aug[:, : (num_components_pca + idx)], y[:, idx], **fit_params_)
79
+ if idx < y.shape[1] - 1:
80
+ if idx == 0 and self.pca_exclude_first:
81
+ X_aug[:, num_components_pca + idx] = estimator.predict(X)
82
+ else:
83
+ X_aug[:, num_components_pca + idx] = estimator.predict(X_aug[:, : (num_components_pca + idx)])
84
+
85
+
86
+ def predict(self, X):
87
+ Y_pred_chain = np.zeros((X.shape[0], len(self.estimators)))
88
+ X_transformed = self.pipe.transform(X)
89
+ X_aug = np.hstack((X_transformed, Y_pred_chain))
90
+ for idx, estimator in enumerate(self.estimators):
91
+ if idx == 0 and self.pca_exclude_first:
92
+ Y_pred_chain[:, idx] = estimator.predict(X)
93
+ else:
94
+ Y_pred_chain[:, idx] = estimator.predict(X_aug[:, : (X_transformed.shape[1] + idx)])
95
+ if idx < len(self.estimators) - 1:
96
+ X_aug[:, X_transformed.shape[1] + idx] = Y_pred_chain[:, idx]
97
+ return Y_pred_chain
98
+
99
+ def score(self, X, y):
100
+ return utils.comp_score(y, self.predict(X))
101
+
102
+ def get_params(self, deep=True):
103
+ return {
104
+ 'base_regressor': self.base_regressor,
105
+ 'num_components': self.num_components,
106
+ 'pca_exclude_first': self.pca_exclude_first,
107
+ **self.fit_params
108
+ }
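A usage sketch for `RegChainWithPCA` on synthetic data, with Ridge as the base regressor and four targets that sum to one (mimicking task d); all values are illustrative:

    import numpy as np
    from sklearn.linear_model import Ridge
    from src.multiregression import RegChainWithPCA

    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 768))           # e.g. sentence embeddings
    y = rng.dirichlet(np.ones(4), size=100)   # four-way probability targets
    chain = RegChainWithPCA(Ridge(), num_components=0.97)
    chain.fit(X, y)
    preds = chain.predict(X)                  # shape (100, 4)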
src/roberta_regressor.py ADDED
@@ -0,0 +1,196 @@
1
+ """
2
+ Defines a wrapper class of RobertaPreTrainedModel model to do regression on text data.
3
+ Based on: https://www.kaggle.com/code/sumantindurkhya/bert-for-regression
4
+ """
5
+
6
+ from typing import Optional, Tuple, Union
7
+ from tqdm import tqdm, trange
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn.functional as F
12
+ import torch.utils.checkpoint
13
+ from torch import nn
14
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
15
+
16
+ from transformers import BertModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel
17
+
18
+ class RobertaRegressor(RobertaPreTrainedModel):
19
+
20
+ def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
21
+ super().__init__(config)
22
+
23
+ self.num_outputs = num_outputs
24
+
25
+ self.roberta = RobertaModel(config)
26
+ if freeze_bert:
27
+ # freeze the roberta parameters
28
+ for param in self.roberta.parameters():
29
+ param.requires_grad = False
30
+ self.classifier = nn.Linear(config.hidden_size, 128)
31
+ self.relu = nn.ReLU()
32
+ self.dropout = nn.Dropout(dropout)
33
+ self.tanh = nn.Tanh()
34
+ self.regressor = nn.Linear(128, num_outputs)
35
+
36
+
37
+ def forward(self, input_ids, attention_mask):
38
+ # forward pass of the model
39
+ base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
40
+ logits = base_out.pooler_output
41
+ out = self.classifier(logits)
42
+ out = self.dropout(out)
43
+ out = self.relu(out)
44
+ out = self.tanh(out)
45
+ out = self.dropout(out)
46
+ out = self.regressor(out)
47
+ return out
48
+
49
+ def predict(self, text:str, tokenizer, device, numpy=True) -> Tuple[float, float, float, float]:
50
+ input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
51
+ input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
52
+ output = self(input_ids, attention_mask).squeeze()
53
+ # free up memory
54
+ del input_ids, attention_mask
55
+ out = output.detach()
56
+ if numpy:
57
+ return out.cpu().numpy()
58
+ return out
59
+
60
+
61
+ class RobertaSeqMultiRegressor(RobertaPreTrainedModel):
62
+ """
63
+ A wrapper class of RobertaPreTrainedModel model to do multi-output regression on text data.
64
+ This models the task of predicting multiple outputs from a single text input.
65
+ The problem is formulated in a sequential manner, where the model predicts the next output
66
+ conditioned on the previous outputs.
67
+
68
+ This approach is ideal for modeling problems where the outputs are correlated
69
+ such as probability distributions, where the sum of the outputs must be 1.
70
+ Or, for example, in the case of predicting the next word in a sentence, where the
71
+ model must predict the next word conditioned on the previous words.
72
+
73
+ The model is similar to the one described in the RobertaRegressor class, with the
74
+ exception that the head of the model is a sequential model, where the output of the
75
+ previous layer is fed as input to the next layer similar to how a RNN works.
76
+ """
77
+
78
+ def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
79
+ super().__init__(config)
80
+
81
+ self.num_outputs = num_outputs
82
+
83
+ self.roberta = RobertaModel(config)
84
+ if freeze_bert:
85
+ # freeze the roberta parameters
86
+ for param in self.roberta.parameters():
87
+ param.requires_grad = False
88
+ # head of the model is a model that takes the output of the previous layer as input
89
+ # and outputs a single value until the number of outputs is reached
90
+ for i in range(num_outputs):
91
+ setattr(self, f"regressor_{i}", nn.Linear(config.hidden_size, 128))
92
+ self.relu = nn.ReLU()
93
+ self.dropout = nn.Dropout(dropout)
94
+ self.tanh = nn.Tanh()
95
+
96
+ def forward(self, input_ids, attention_mask):
97
+ # forward pass of the model
98
+ base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
99
+ logits = base_out.pooler_output
100
+ outputs = []
101
+ for i in range(self.num_outputs):
102
+ out = getattr(self, f"regressor_{i}")(logits)
103
+ out = self.dropout(out)
104
+ out = self.relu(out)
105
+ out = self.tanh(out)
106
+ outputs.append(out)
107
+ return outputs
108
+
109
+
110
+ def sum_diff_loss(output, target):
111
+ return torch.sum(torch.abs(output - target))
112
+
113
+ def evaluate(model, criterion, dataloader, device, sum_diff_penalty=False):
114
+ model.eval()
115
+ mean_acc, mean_loss, count = 0, 0, 0
116
+
117
+ with torch.no_grad():
118
+ for input_ids, attention_mask, target in (dataloader):
119
+
120
+ input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
121
+ output = model(input_ids, attention_mask)
122
+
123
+ mean_loss += criterion(output.squeeze(), target.type_as(output)).item()
124
+ count += 1
125
+
126
+ return mean_loss/count
127
+
128
+ # def predict(model, dataloader, device):
129
+ # predicted_label = []
130
+ # actual_label = []
131
+ # with torch.no_grad():
132
+ # for input_ids, attention_mask, target in (dataloader):
133
+
134
+ # input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
135
+ # output = model(input_ids, attention_mask)
136
+
137
+ # predicted_label += output
138
+ # actual_label += target
139
+
140
+ # return predicted_label
141
+
142
+ def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
143
+ best_acc = 0
144
+ for epoch in trange(epochs, desc="Epoch"):
145
+ model.train()
146
+ train_loss = 0
147
+ for i, (input_ids, attention_mask, target) in enumerate(iterable=train_loader):
148
+ optimizer.zero_grad()
149
+
150
+ input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
151
+
152
+ output = model(input_ids=input_ids, attention_mask=attention_mask)
153
+ # out = model.classifier(output)
154
+ loss = criterion(output.squeeze(), target.type_as(output))
155
+ loss.backward()
156
+ optimizer.step()
157
+
158
+ train_loss += loss.item()
159
+
160
+ print(f"Training loss is {train_loss/len(train_loader)}")
161
+ val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
162
+ print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))
163
+
164
+ def multi_reg_loss(loss='mse', sum_diff_penalty:float=0.0):
165
+ """
166
+ A custom loss function that penalizes the sum of differences
167
+ between the predicted and actual values for multi-output regression.
168
+ This is done to guide the model to predict outputs where
169
+ sum(y_hat1, y_hat2, ...) = sum(y1, y2, ...)
170
+
171
+ e.g: in task d, we have that sum(label1, label2, label3, label4) = 1
172
+ since its a probability distribution.
173
+
174
+ Parameters
175
+ ----------
176
+ loss : str, optional
177
+ The loss function to be used, by default 'mse'
178
+ Available options: 'mse' and 'cross_entropy'
179
+ for mean squared error and cross entropy loss respectively
180
+ sum_diff_penalty : float, optional
181
+ The penalty to be applied to the sum of differences between the predicted and actual values, by default 0.0 (no penalty)
182
+ """
183
+ if loss == 'mse':
184
+ loss_func = F.mse_loss
185
+ elif loss == 'cross_entropy':
186
+ loss_func = F.cross_entropy
187
+ else:
188
+ raise ValueError("Invalid loss function. Available options: 'mse' and 'cross_entropy'")
189
+ def reg_loss(input, target):
190
+ # first compute the normal MSE loss
191
+ mse = loss_func(input, target)
192
+ # then penalize the sum of differences between the predicted and actual values
193
+ sum_diff = torch.square(torch.sum(input) - torch.sum(target))
194
+ return mse + sum_diff_penalty*sum_diff
195
+ return reg_loss
196
+
src/train.py ADDED
@@ -0,0 +1,92 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ # Embeddings
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+ # train a classifier on the embeddings for multiclass regression
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.metrics import (
9
+ r2_score, mean_squared_error, # regression metrics
10
+ accuracy_score, f1_score, precision_score, recall_score # classification metrics
11
+ )
12
+ from sklearn.multioutput import MultiOutputRegressor, RegressorChain # for multiclass regression
13
+
14
+ # Estimators
15
+ from sklearn.ensemble import (
16
+ RandomForestRegressor,
17
+ RandomForestClassifier,
18
+ GradientBoostingRegressor,
19
+ GradientBoostingClassifier,
20
+ AdaBoostRegressor,
21
+ AdaBoostClassifier
22
+ )
23
+ from sklearn.linear_model import (
24
+ LinearRegression,
25
+ LogisticRegression,
26
+ Ridge,
27
+ Lasso
28
+ )
29
+ # other regressors
30
+ from sklearn.svm import SVR
31
+ from sklearn.neighbors import KNeighborsRegressor
32
+ from sklearn.neural_network import MLPRegressor
33
+ from sklearn.tree import DecisionTreeRegressor
34
+
35
+ from lightgbm import LGBMRegressor, LGBMClassifier
36
+
37
+ # type hinting
38
+ import os, json
39
+ from typing import List, Callable, Dict, Tuple, Any
40
+
41
+ # local imports
42
+ from src import data, utils
43
+ from src.embeddings import EmbeddingsRegressor
44
+
45
+ def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
46
+ """
47
+ Metric for multiclass regression. Computes the average of the RMSE scores for each label.
48
+ """
49
+ rmse_scores = []
50
+ for i in range(y_true.shape[1]):
51
+ rmse_scores.append(np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i])))
52
+ return np.mean(rmse_scores)
53
+
54
+
55
+ def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
56
+ def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
57
+ estimator_scores = {}
58
+ for name, estimator in estimators:
59
+ estimator.fit(X_train, y_train)
60
+ y_pred = estimator.predict(X_test)
61
+ score = score_func(y_test, y_pred)#*(1.4*((y_train>th).sum()/(len(y_train)-1))) # weighted for class imbalance
62
+ print(f"\"{name}\" estimator score: {score:.4f}")
63
+ estimator_scores[name] = score
64
+ return estimator_scores
65
+ return fit_eval_estimators
66
+
67
+
68
+ def get_data():
69
+ # load the train and test data
70
+ train_data = data.load('train')
71
+ test_df = data.load('test')
72
+ # concat messages by subject id
73
+ train_data = data.concat_messages(train_data)
74
+ test_df = data.concat_messages(test_df)
75
+
76
+ # split into 15% of subject ids for validation
77
+ # get the classes as the argmax of the label probabilities to use them for stratification
78
+ subj_classes = train_data.set_index('subject_id').filter(regex='^d_')\
79
+ .apply(lambda x: x.argmax() if x[:-1].sum()<0.5 else x[:-1].argmax(), axis=1)\
80
+ .replace(dict(enumerate(train_data.filter(regex='^d_').columns)))
81
+ tr_subj_ids, val_subj_ids = train_test_split(subj_classes.index, test_size=0.15, random_state=42, stratify=subj_classes.values)
82
+ # split the train data into train and validation sets
83
+ val_df = train_data[train_data['subject_id'].isin(val_subj_ids)]
84
+ train_df = train_data[train_data['subject_id'].isin(tr_subj_ids)]
85
+
86
+ # augment the train data by taking only the first half of the messages
87
+ half_messages_df_train = train_df.assign(
88
+ message=lambda df: df['message'].apply(lambda x: ' | '.join(x.split(' | ')[:len(x.split(' | '))//2])),
89
+ # num_messages=lambda df: df['message'].apply(lambda x: len(x.split(' | ')))
90
+ )
91
+ train_df = pd.concat([train_df, half_messages_df_train], axis=0).sort_values('subject_id').reset_index(drop=True)
92
+ return train_df, val_df, test_df
src/utils.py ADDED
@@ -0,0 +1,62 @@
+ from typing import List, Tuple
+ import numpy as np
+ import pandas as pd
+
+ def print_messages(msgs: List[dict]):
+     """
+     Print the messages of a subject
+
+     Messages are a list of dictionaries of the form: [{'id_message': {int_id}, 'message': '{str_message}', 'date': '{str_date}'}, ...]
+     and are attached to a specific subject.
+     """
+     for message in msgs:
+         print(f"{message['date']} - {message['message']}")
+
+ def load_data(files, truth):
+     """load all the data into a dataframe"""
+     import os, json
+     data = []
+     for f in files:
+         with open(f) as file:
+             msgs = json.load(file)
+             for msg in msgs:
+                 data.append([os.path.basename(f).split('.')[0], msg['id_message'], msg['date'], msg['message']])
+     df = pd.DataFrame(data, columns=['subject_id', 'id_message', 'date', 'message'])
+     df = df.merge(truth, on='subject_id')
+     return df
+
+
+ def normalize(x, prob=True):
+     """
+     Normalize a vector to [0,1] and sum 1 if prob=True
+     """
+     x = x.reshape(-1, 4)
+     # normalize to [0,1]
+     x = ((x - x.min(axis=1)[..., None]) / (x.max(axis=1)[..., None] - x.min(axis=1)[..., None])).round(4)
+     if prob:
+         # normalize to sum 1
+         x = x / x.sum(axis=1)[..., None]
+     return x.round(4)
+
+ def label_metrics(score_fun, y_true, y_pred):
+     scores = []
+     for i in range(y_true.shape[1]):
+         scores.append(score_fun(y_true[:, i], y_pred[:, i]))
+     return scores
+
+ def make_predict(predict_fn, **kwargs):
+     def predict(msg):
+         pred = predict_fn(msg, **kwargs)
+         return pred
+     return predict
+
+
+ def comp_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+     """
+     Metric for simple and multiclass regression. Computes the average of the RMSE scores for each label.
+     """
+     from sklearn.metrics import mean_squared_error
+     rmse_scores = []
+     for i in range(y_true.shape[1]):
+         rmse_scores.append(np.sqrt(mean_squared_error(y_true[:, i], y_pred[:, i])))
+     return np.mean(rmse_scores)