Spaces:

nlpUc3mStudents
/

DepNLP-MentalRiskEs

Sleeping

App Files Files Community

simonsv committed on Jun 15, 2023

Commit

1eba40c

•

1 Parent(s): dc7dab2

made simple functional streamlit app to host the model

Browse files

Files changed (22) hide show

app.py +38 -0
models/2d_ridge_roberta-suicide-regchain-pca-final.pkl +0 -0
requirements.txt +5 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-38.pyc +0 -0
src/__pycache__/class_eval.cpython-38.pyc +0 -0
src/__pycache__/data.cpython-38.pyc +0 -0
src/__pycache__/embeddings.cpython-38.pyc +0 -0
src/__pycache__/eval.cpython-38.pyc +0 -0
src/__pycache__/multiregression.cpython-38.pyc +0 -0
src/__pycache__/roberta_regressor.cpython-38.pyc +0 -0
src/__pycache__/utils.cpython-38.pyc +0 -0
src/berta_finetuning.py +28 -0
src/class_eval.py +576 -0
src/data.py +104 -0
src/embeddings.py +49 -0
src/eval.py +195 -0
src/models.py +5 -0
src/multiregression.py +108 -0
src/roberta_regressor.py +196 -0
src/train.py +92 -0
src/utils.py +62 -0

app.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import streamlit as st
+import pickle
+import numpy as np
+import os, glob, json, sys
+import pickle
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from src import data, utils
+from src.embeddings import EmbeddingsRegressor
+# load the models
+with open('models/2d_ridge_roberta-suicide-regchain-pca-final.pkl', 'rb') as f:
+    regressor = pickle.load(f)
+model_name = 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
+tokenizer = SentenceTransformer(model_name)
+model = EmbeddingsRegressor(tokenizer, regressor, normalize_output=True)
+predict = utils.make_predict(model.predict)
+# model_selector = st.sidebar.selectbox(
+#     'Select model:',
+#     ['roberta', 'roberta_seq_multi', 'roberta_seq_multi_2']
+# )
+text_input = st.text_input('Enter your text here:')
+if text_input:
+    prediction = predict([text_input]).tolist()
+    prediction = np.array(prediction).reshape(-1,4)
+    prediction = utils.normalize(prediction)
+    preds_df = data.make_task_labels_from_d(prediction, include_d=True).rename(
+        columns={c:'d_'+c.replace('+','_').replace('|','_') for c in data.task_d_cols}
+    )
+    preds_df['b_label'] = np.clip(preds_df['b_label'], 0, 1)
+    # show the dataframe
+    table = st.table(preds_df)

models/2d_ridge_roberta-suicide-regchain-pca-final.pkl ADDED Viewed

Binary file (154 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+transformers
+sentence-transformers
+pandas
+streamlit
+scikit-learn>=1.2.1

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (172 Bytes). View file

src/__pycache__/class_eval.cpython-38.pyc ADDED Viewed

Binary file (15.7 kB). View file

src/__pycache__/data.cpython-38.pyc ADDED Viewed

Binary file (4.34 kB). View file

src/__pycache__/embeddings.cpython-38.pyc ADDED Viewed

Binary file (2.15 kB). View file

src/__pycache__/eval.cpython-38.pyc ADDED Viewed

Binary file (7.51 kB). View file

src/__pycache__/multiregression.cpython-38.pyc ADDED Viewed

Binary file (4.34 kB). View file

src/__pycache__/roberta_regressor.cpython-38.pyc ADDED Viewed

Binary file (6.73 kB). View file

src/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (2.52 kB). View file

src/berta_finetuning.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from datasets import Dataset, load_dataset#, Features, Value, ClassLabe
+# Fine-tuning scaffold: downloads the dataset, merges each subject's messages
+# into one text and loads the tokenizer. The model itself is not built yet
+# (see the last line).
+ds = load_dataset('nlpUc3mStudents/mental-risk-c')
+# to pandas
+train_df = ds['train'].to_pandas()
+# NOTE(review): test_df is not used anywhere below in this script.
+test_df = ds['test'].to_pandas()
+# Every column from the fifth onwards is treated as a label column
+# (presumably the first four are subject/message metadata; verify against the dataset).
+label_names = train_df.iloc[:,4:].columns.tolist()
+# concat messages by subject id
+# Messages are joined with ' | '; each label keeps the first value seen for the subject.
+train_by_subjectid = (
+    train_df.groupby('subject_id')
+    .agg({'message': lambda x: ' | '.join(x), **{col: 'first' for col in label_names}})
+    .reset_index()
+    # .assign(
+    #     num_messages=lambda x: x.message.str.count('\|') + 1
+    # )
+)
+# back to datasets
+# NOTE: from here on train_df is a datasets.Dataset, no longer a pandas DataFrame.
+train_df = Dataset.from_pandas(train_by_subjectid)
+model_name= 'hackathon-somos-nlp-2023/roberta-base-bne-finetuned-suicide-es'
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# this model is trained with 2 labels, yet we need 4, so we need to change the head
+# Placeholder: the classification model has not been instantiated yet.
+model = None

src/class_eval.py ADDED Viewed

	@@ -0,0 +1,576 @@

+#This file has been developed by the SINAI research group for its usage in the MentalRiskES evaluation campaign at IberLEF 2023.
+# Required libraries
+import pandas as pd
+import numpy as np
+import sklearn.metrics as metrics
+from scipy.stats import pearsonr
+# Read Gold labels for BinaryClassification
+def read_qrels(qrels_file):
+    qrels={}
+    df_golden_truth = pd.read_csv(qrels_file)
+    for index, r in df_golden_truth.iterrows():
+        qrels[ r['Subject'] ] = int(r['label'])
+    print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
+    return(qrels)
+# Read Gold labels for Simple Regression
+def read_qrels_regression(qrels_file):
+    qrels={}
+    df_golden_truth = pd.read_csv(qrels_file)
+    for index, r in df_golden_truth.iterrows():
+        qrels[ r['Subject'] ] = float(r['label'])
+    print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
+    return(qrels)
+# Read Gold labels for Multiclass classification
+def read_qrels_multiclass(qrels_file):
+    qrels={}
+    qrels1 = {}
+    df_golden_truth = pd.read_csv(qrels_file)
+    for index, r in df_golden_truth.iterrows():
+        qrels1[ r['Subject'] ] = r['label']
+        if "suffer" in r['label']:
+            qrels[ r['Subject'] ] = 1
+        else:
+            qrels[ r['Subject'] ] = 0
+    print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
+    return qrels, qrels1
+# Read Gold labels for Multi-output regression
+def read_qrels_multioutput(qrels_file):
+    qrels={}
+    df_golden_truth = pd.read_csv(qrels_file)
+    for index, r in df_golden_truth.iterrows():
+        qrels[ r['Subject'] ] = [r['suffer_in_favour'],r['suffer_against'],r['suffer_other'],r['control']]
+    print("\n"+str(len(qrels))+ " lines read in qrels file!\n\n")
+    return qrels
+###########################################################################
+# Calculation of Binary classification metrics for Binary classification tasks
+class BinaryClassification():
+    def __init__(self, task, data, qrels):
+        self.run_results = data
+        self.qrels_b = read_qrels(qrels)
+        self.task = task
+    pass
+    def penalty(self,delay):
+        if self.task == "1": # TCA
+            p = 0.0292 # trial
+        elif self.task == "2": # Depression
+            p = 0.0179 # trial
+        pen = -1.0 + 2.0/(1+np.exp(-p*(delay-1)))
+        return(pen)
+    def n_pos(self):
+        total_pos = 0
+        for key in self.qrels_b:
+            total_pos += self.qrels_b[key]
+        return(total_pos)
+    def eval_performance(self):
+        print("===================================================")
+        print("DECISION-BASED EVALUATION:")
+        self.run_results = self.run_results.sort_values(by=['nick'])
+        total_pos=self.n_pos()
+        erdes5 = np.zeros(len(self.run_results))
+        erdes30 = np.zeros(len(self.run_results))
+        erdes50 = np.zeros(len(self.run_results))
+        ierdes = 0
+        true_pos = 0
+        false_pos = 0
+        latency_tps = list()
+        penalty_tps = list()
+        # Latency-based metrics
+        for index, r in self.run_results.iterrows():
+            try:
+                if ( self.qrels_b[ r['nick'] ] ==  r['pred'] ):
+                    if ( r['pred'] == 1 ):
+                        true_pos+=1
+                        erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
+                        erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))
+                        erdes50[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 50.0)))
+                        latency_tps.append(r["round"]+1)
+                        penalty_tps.append(self.penalty(r["round"]+1))
+                    else:
+                        erdes5[ierdes]=0
+                        erdes30[ierdes]=0
+                        erdes50[ierdes]=0
+                else:
+                    if ( r['pred'] == 1 ):
+                        false_pos+=1
+                        erdes5[ierdes]=float(total_pos)/float(len(self.qrels_b))
+                        erdes50[ierdes]=float(total_pos)/float(len(self.qrels_b))
+                    else:
+                        erdes5[ierdes]=1
+                        erdes30[ierdes]=1
+                        erdes50[ierdes]=1
+            except KeyError:
+                print("User does not appear in the qrels:"+r['nick'])
+            ierdes+=1
+        _speed = 1-np.median(np.array(penalty_tps))
+        if true_pos != 0 :
+            precision = float(true_pos) / float(true_pos+false_pos)
+            recall = float(true_pos) / float(total_pos)
+            f1_erde = 2 * (precision * recall) / (precision + recall)
+            _latencyweightedF1 = f1_erde*_speed
+        else:
+            _latencyweightedF1 = 0
+            _speed = 0
+        y_true = self.run_results['pred'].tolist()
+        y_pred_b = list(self.qrels_b.values())
+        # Binary metrics
+        accuracy = metrics.accuracy_score(y_true, y_pred_b)
+        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
+        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
+        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
+        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
+        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
+        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')
+        print("BINARY METRICS: =============================")
+        print("Accuracy:"+str(accuracy))
+        print("Macro precision:"+str(macro_precision))
+        print("Macro recall:"+str(macro_recall))
+        print("Macro f1:"+str(macro_f1))
+        print("Micro precision:"+str(micro_precision))
+        print("Micro recall:"+str(micro_recall))
+        print("Micro f1:"+str(micro_f1))
+        print("LATENCY-BASED METRICS: =============================")
+        print("ERDE_5:"+str(np.mean(erdes5)))
+        print("ERDE_50:"+str(np.mean(erdes50)))
+        print("Median latency:"+str(np.median(np.array(latency_tps))))
+        print("Speed:"+str(_speed))
+        print("latency-weightedF1:"+str(_latencyweightedF1))
+        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
+        'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30': np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': np.median(np.array(latency_tps)),
+        'speed': _speed, 'latency-weightedF1': _latencyweightedF1}
+    # Calculation of P@10, P@20, P@30, P@50
+    def eval_performance_rank_based(self):
+        print("===================================================")
+        print("RANK-BASED EVALUATION:")
+        ranks_at=[1,50,75]
+        rank_dit = {}
+        for rank in ranks_at:
+            print("Analizing ranking at round "+str(rank))
+            rels_topk = [0,0,0,0]
+            self.run_results["label"] = self.qrels_b.values()
+            self.run_results = self.run_results.sort_values(by=['pred'],ascending=False)
+            i = 0
+            for index, r in self.run_results.iterrows():
+                if i<10:
+                    if r["pred"] == r['label']:
+                        rels_topk[0] += 1
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<20:
+                    if r["pred"] == r['label']:
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<30:
+                    if r["pred"] == r['label']:
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<50:
+                    if r["pred"] == r['label']:
+                        rels_topk[3] += 1
+                else:
+                    break
+                i+=1
+            p10 = float(rels_topk[0])/10.0
+            p20 = float(rels_topk[1])/20.0
+            p30 = float(rels_topk[2])/30.0
+            p50 = float(rels_topk[3])/50.0
+            print("PRECISION AT K: =============================")
+            print("P@10:"+str(p10))
+            print("P@20:"+str(p20))
+            print("P@30:"+str(p30))
+            print("P@50:"+str(p50))
+            rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
+        return rank_dit
+#############################################################################################
+# Calculation of Regression metrics for Simple regression tasks
+class ClassRegressionEvaluation():
+    def __init__(self, task, data, qrels):
+        self.run_results = data
+        self.qrels = read_qrels_regression(qrels)
+        self.task = task
+    def eval_performance(self):
+        self.run_results = self.run_results.sort_values(by=['nick'])
+        y_true = self.run_results['pred'].tolist()
+        y_pred_r = list(self.qrels.values())
+        # Regression metrics
+        _rmse = metrics.mean_squared_error(y_true, y_pred_r, sample_weight=None, multioutput='raw_values', squared=False)[0]
+        _pearson = np.corrcoef(y_true, y_pred_r)
+        _pearson, _ = pearsonr(y_true, y_pred_r)
+        print("REGRESSION METRICS: =============================")
+        print("RMSE:"+str(_rmse))
+        print("Pearson correlation coefficient:"+str(_pearson))
+        return { 'RMSE:': _rmse, 'Pearson_coefficient': _pearson}
+    # Calculation of P@10, P@20, P@30, P@50
+    def eval_performance_rank_based(self):
+        print("===================================================")
+        print("RANK-BASED EVALUATION:")
+        ranks_at=[1,25,50,75]
+        rank_dit = {}
+        for rank in ranks_at:
+            print("Analizing ranking at round "+str(rank))
+            rels_topk = [0,0,0,0,0]
+            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
+            self.run_results_["label"] = self.qrels.values()
+            self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)
+            i = 0
+            for index, r in self.run_results_.iterrows():
+                if i<5:
+                    if r["label"] == round(r["pred"],1):
+                        rels_topk[0] += 1
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                        rels_topk[4] += 1
+                elif i<10:
+                    if r['label'] == round(r["pred"],1):
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                        rels_topk[4] += 1
+                elif i<20:
+                    if  r['label'] == round(r["pred"],1):
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                        rels_topk[4] += 1
+                elif i<30:
+                    if  r['label'] == round(r["pred"],1):
+                        rels_topk[3] += 1
+                        rels_topk[4] += 1
+                elif i<50:
+                    if  r['label'] == round(r["pred"],1):
+                        rels_topk[4] += 1
+                else:
+                    break
+                i+=1
+            p5 = float(rels_topk[0])/5.0
+            p10 = float(rels_topk[1])/10.0
+            p20 = float(rels_topk[2])/20.0
+            p30 = float(rels_topk[3])/30.0
+            p50 = float(rels_topk[4])/50.0
+            print("PRECISION AT K: =============================")
+            print("P@5:"+str(p5))
+            print("P@10:"+str(p10))
+            print("P@20:"+str(p20))
+            print("P@30:"+str(p30))
+            print("P@50:"+str(p50))
+            rank_dit[rank] = {"@5":p5,"@10":p10,"@20":p20,"@30":p30,"@50":p50}
+        return rank_dit
+############################################################################
+# Calculation of Binary metrics for Multiclass classification tasks
+class BinaryMultiClassification():
+    def __init__(self, task, data, qrels):
+        self.run_results = data
+        self.qrels_b, self.qrels_multiclass  = read_qrels_multiclass(qrels)
+        self.task = task
+    pass
+    def penalty(self,delay):
+        if self.task == "1": # TCA
+            p = 0.0411 # test
+            p = 0.0292 # trial
+        elif self.task == "2": # Depression
+            p = 0.0326 # test
+            p = 0.0179 # trial
+        else: # Unkown
+            p = 0.0308 # test
+        pen = -1.0 + 2.0/(1+np.exp(-p*(delay-1)))
+        return(pen)
+    def n_pos(self):
+        total_pos = 0
+        for key in self.qrels_b:
+            total_pos += self.qrels_b[key]
+        return(total_pos)
+    def eval_performance(self):
+        print("===================================================")
+        print("DECISION-BASED EVALUATION:")
+        self.run_results = self.run_results.sort_values(by=['nick'])
+        total_pos=self.n_pos() # Total number of positive documents
+        erdes5 = np.zeros(len(self.run_results))
+        erdes30 = np.zeros(len(self.run_results))
+        erdes50 = np.zeros(len(self.run_results))
+        ierdes = 0
+        true_pos = 0
+        false_pos = 0
+        latency_tps = list()
+        penalty_tps = list()
+        for index, r in self.run_results.iterrows():
+            try:
+                if ( self.qrels_b[ r['nick'] ] ==  r['pred_b'] ):
+                    if ( r['pred_b'] == 1 ):
+                        true_pos+=1
+                        erdes5[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 5.0)))
+                        erdes30[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 30.0)))
+                        erdes50[ierdes]=1.0 - (1.0/(1.0+np.exp( (r["round"]+1) - 50.0)))
+                        latency_tps.append(r["round"]+1)
+                        penalty_tps.append(self.penalty(r["round"]+1))
+                    else:
+                        erdes5[ierdes]=0
+                        erdes30[ierdes]=0
+                        erdes50[ierdes]=0
+                else:
+                    if ( r['pred_b'] == 1 ):
+                        false_pos+=1
+                        erdes5[ierdes]=float(total_pos)/float(len(self.qrels_b))
+                        erdes30[ierdes]=float(total_pos)/float(len(self.qrels_b))
+                        erdes50[ierdes]=float(total_pos)/float(len(self.qrels_b))
+                    else:
+                        erdes5[ierdes]=1
+                        erdes30[ierdes]=1
+                        erdes50[ierdes]=1
+            except KeyError:
+                print("User does not appear in the qrels:"+r['nick'])
+            ierdes+=1
+        _speed = 1-np.median(np.array(penalty_tps))
+        if true_pos != 0 :
+            precision = float(true_pos) / float(true_pos+false_pos)
+            recall = float(true_pos) / float(total_pos)
+            f1_erde = 2 * (precision * recall) / (precision + recall)
+            _latencyweightedF1 = f1_erde*_speed
+        else:
+            _latencyweightedF1 = 0
+            _speed = 0
+        y_true = self.run_results['pred'].tolist()
+        y_pred_b = list(self.qrels_multiclass.values())
+        # Binary metrics
+        accuracy = metrics.accuracy_score(y_true, y_pred_b)
+        macro_precision = metrics.precision_score(y_true, y_pred_b, average='macro')
+        macro_recall = metrics.recall_score(y_true, y_pred_b, average='macro')
+        macro_f1 = metrics.f1_score(y_true, y_pred_b, average='macro')
+        micro_precision = metrics.precision_score(y_true, y_pred_b, average='micro')
+        micro_recall = metrics.recall_score(y_true, y_pred_b, average='micro')
+        micro_f1 = metrics.f1_score(y_true, y_pred_b, average='micro')
+        print("BINARY METRICS: =============================")
+        print("Accuracy:"+str(accuracy))
+        print("Macro precision:"+str(macro_precision))
+        print("Macro recall:"+str(macro_recall))
+        print("Macro f1:"+str(macro_f1))
+        print("Micro precision:"+str(micro_precision))
+        print("Micro recall:"+str(micro_recall))
+        print("Micro f1:"+str(micro_f1))
+        print("LATENCY-BASED METRICS: =============================")
+        print("ERDE_5:"+str(np.mean(erdes5)))
+        print("ERDE_50:"+str(np.mean(erdes50)))
+        print("Median latency:"+str(np.median(np.array(latency_tps))))
+        print("Speed:"+str(_speed))
+        print("latency-weightedF1:"+str(_latencyweightedF1))
+        return {'Accuracy': accuracy, 'Macro_P': macro_precision, 'Macro_R': macro_recall,'Macro_F1': macro_f1,'Micro_P': micro_precision, 'Micro_R': micro_recall,
+        'Micro_F1': micro_f1, 'ERDE5':np.mean(erdes5),'ERDE30':np.mean(erdes30),'ERDE50': np.mean(erdes50), 'latencyTP': np.median(np.array(latency_tps)),
+        'speed': _speed, 'latency-weightedF1': _latencyweightedF1}
+    # Calculation of P@10, P@20, P@30, P@50
+    def eval_performance_rank_based(self):
+        print("===================================================")
+        print("PRECISION AT K - EVALUATION:")
+        ranks_at=[1,50,75]
+        rank_dit = {}
+        for rank in ranks_at:
+            print("Analizing ranking at round "+str(rank))
+            rels_topk = [0,0,0,0]
+            self.run_results["label"] = self.qrels_b.values()
+            self.run_results = self.run_results.sort_values(by=['pred_b'],ascending=False)
+            i = 0
+            for index, r in self.run_results.iterrows():
+                if i<10:
+                    if r["pred_b"] == r['label']:
+                        rels_topk[0] += 1
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<20:
+                    if r["pred_b"] == r['label']:
+                        rels_topk[1] += 1
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<30:
+                    if r["pred_b"] == r['label']:
+                        rels_topk[2] += 1
+                        rels_topk[3] += 1
+                elif i<50:
+                    if r["pred_b"] == r['label']:
+                        rels_topk[3] += 1
+                else:
+                    break
+                i+=1
+            p10 = float(rels_topk[0])/10.0
+            p20 = float(rels_topk[1])/20.0
+            p30 = float(rels_topk[2])/30.0
+            p50 = float(rels_topk[3])/50.0
+            print("PRECISION AT K: =============================")
+            print("P@10:"+str(p10))
+            print("P@20:"+str(p20))
+            print("P@30:"+str(p30))
+            print("P@50:"+str(p50))
+            rank_dit[rank] = {"@10":p10,"@20":p20,"@30":p30,"@50":p50}
+        return rank_dit
+#######################################################################################
+# Calculation of Regression metrics for Multi-output regression tasks
+class ClassMultiRegressionEvaluation():
+    def __init__(self, task, data, qrels):
+        self.run_results = data
+        self.qrels = read_qrels_multioutput(qrels)
+        self.task = task
+    def eval_performance(self):
+        self.run_results = self.run_results.sort_values(by=['nick'])
+        y_true = self.run_results['pred'].tolist()
+        y_pred_r = list(self.qrels.values())
+        # Regression metrics
+        _rmse = metrics.mean_squared_error(y_true, y_pred_r, sample_weight=None, multioutput='raw_values', squared=False)[0]
+        _pearson_sf, _ = pearsonr([item[0] for item in y_true] , [item[0] for item in y_pred_r])
+        _pearson_sa, _ = pearsonr([item[1] for item in y_true] , [item[1] for item in y_pred_r])
+        _pearson_so, _ = pearsonr([item[2] for item in y_true] , [item[2] for item in y_pred_r])
+        _pearson_c, _ = pearsonr([item[3] for item in y_true] , [item[3] for item in y_pred_r])
+        print("REGRESSION METRICS: =============================")
+        print("RMSE:"+str(_rmse))
+        print("Pearson correlation coefficient:")
+        print("Pearson sf:"+str(_pearson_sf))
+        print("Pearson sa:"+str(_pearson_sa))
+        print("Pearson so:"+str(_pearson_so))
+        print("Pearson c:"+str(_pearson_c))
+        pearson = (_pearson_sf + _pearson_sa + _pearson_so + _pearson_c)/4
+        return { 'RMSE:': _rmse, 'Pearson_mean': pearson,'Pearson_sf': _pearson_sf, 'Pearson_sa': _pearson_sa,'Pearson_so': _pearson_so,'Pearson_c': _pearson_c}
+    # Calculation of P@10, P@20, P@30, P@50
+    def eval_performance_rank_based(self):
+        print("===================================================")
+        print("PRECISION AT - EVALUATION:")
+        ranks_at=[1,25,50,75]
+        rank_dit = {}
+        for rank in ranks_at:
+            print("Analizing ranking at round "+str(rank))
+            self.run_results_ = self.run_results[rank].sort_values(by=['nick'])
+            self.run_results_["label"] = self.qrels.values()
+            self.run_results_ = self.run_results_.sort_values(by=['pred'],ascending=False)
+            p5 = 0
+            p10 = 0
+            p20 = 0
+            p30 = 0
+            p50 = 0
+            for j in range(0,4):
+                rels_topk = [0,0,0,0,0]
+                i = 0
+                for index, r in self.run_results_.iterrows():
+                    if i<5:
+                        if r['label'][j] == round(r["pred"][j],1):
+                            rels_topk[0] += 1
+                            rels_topk[1] += 1
+                            rels_topk[2] += 1
+                            rels_topk[3] += 1
+                            rels_topk[4] += 1
+                    elif i<10:
+                        if r['label'][j] == round(r["pred"][j],1):
+                            rels_topk[0] += 1
+                            rels_topk[1] += 1
+                            rels_topk[2] += 1
+                            rels_topk[3] += 1
+                    elif i<20:
+                        if r['label'][j] == round(r["pred"][j],1):
+                            rels_topk[1] += 1
+                            rels_topk[2] += 1
+                            rels_topk[3] += 1
+                    elif i<30:
+                        if r['label'][j] == round(r["pred"][j],1):
+                            rels_topk[2] += 1
+                            rels_topk[3] += 1
+                    elif i<50:
+                        if r['label'][j] == round(r["pred"][j],1):
+                            rels_topk[3] += 1
+                    else:
+                        break
+                    i+=1
+                p5 += float(rels_topk[0])/5.0
+                p10 += float(rels_topk[0])/10.0
+                p20 += float(rels_topk[1])/20.0
+                p30 += float(rels_topk[2])/30.0
+                p50 += float(rels_topk[3])/50.0
+            print("PRECISION AT K: =============================")
+            print("P@5:"+str(p5/4))
+            print("P@10:"+str(p10/4))
+            print("P@20:"+str(p20/4))
+            print("P@30:"+str(p30/4))
+            print("P@50:"+str(p50/4))
+            rank_dit[rank] = {"@5":p5/4,"@10":p10/4,"@20":p20/4,"@30":p30/4,"@50":p50/4}
+        return rank_dit
+# Class for calculating carbon emission values
+class Emissions():
+    def __init__(self, emissions_run) -> None:
+        self.emissions_run = emissions_run
+        self.aux = {}
+        for key, value in emissions_run.items():
+            self.aux[key] = 0
+        pass
+    # Update of values after a prediction has been made
+    def update_emissions(self,emissions_round):
+        # The values are accumulated in each round, so the difference is calculated to know the values for that round only
+        for key, value in self.emissions_run.items():
+            if key not in ["cpu_count","gpu_count","cpu_model","gpu_model", "ram_total_size"]:
+                round_ = emissions_round[key] - self.aux[key]
+            self.emissions_run[key].append(round_)
+            self.aux[key] = emissions_round[key]
+    # Calculation of final values after all predictions have been made
+    def calculate_emissions(self):
+        dict_ = {}
+        for key, value in self.emissions_run.items():
+            # Non-numerical values
+            if key in ["cpu_count","gpu_count","cpu_model","gpu_model", "ram_total_size"]:
+                dict_[key] = self.emissions_run[key][0]
+            # Numerical values
+            else:
+                dict_[key+"_min"] = min(self.emissions_run[key])
+                dict_[key+"_max"] = max(self.emissions_run[key])
+                dict_[key+"_mean"] = sum(self.emissions_run[key])/len(self.emissions_run[key])
+                dict_[key+"_var"] = np.var(self.emissions_run[key])
+        return dict_

src/data.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import requests, os, glob
+import pandas as pd
+import numpy as np
+train_dir = "./data/train"
+test_dir = "./data/test"
+truth_dir = "golden_truth"
+def load(set_name:str='train', with_labels:bool=True) -> pd.DataFrame:
+    """
+    Load the data from the csv files
+    """
+    if set_name == 'train':
+        path = train_dir
+    elif set_name == 'test':
+        path = test_dir
+    else:
+        raise ValueError("set_name must be either 'train' or 'test'")
+    if not os.path.exists(path):
+        if set_name=="train":
+            df = get_train(with_labels=with_labels)
+        else:
+            df = get_test(with_labels=with_labels)
+    else:
+        data_files = glob.glob(os.path.join(path, '*.json'))
+        if with_labels:
+            truth_path = os.path.join(path, truth_dir, 'task2_gold_d.txt')
+            truth_df = pd.read_csv(truth_path).rename(
+                columns=lambda s: 'd_' + s if s != 'Subject' else 'subject_id'
+            )
+        else:
+            truth_df = None
+        df = load_from_files(data_files, truth=truth_df)
+        abc_labels_df = make_task_labels_from_d(df.filter(regex='^d_').values.astype(float))
+        df = pd.concat([df, abc_labels_df], axis=1)
+    return df
+def concat_messages(df:pd.DataFrame, sep:str=' | ') -> pd.DataFrame:
+    """
+    Concatenate all the messages of a subject into a single message
+    """
+    df = (
+        df
+        .assign(date=lambda x: pd.to_datetime(x['date']))
+        .sort_values(['subject_id', 'date'], ascending=[True, True])
+        .groupby('subject_id')
+        .agg({
+            'message': lambda x: sep.join(x),
+            'round': 'last',
+            **{c: 'first' for c in df.columns.drop(['subject_id', 'message', 'round'])}
+         }).sort_index()
+         .reset_index()
+    )
+    return df
+def load_from_files(files, truth=None):
+    """load all the data into a dataframe"""
+    import os, json
+    data = []
+    for f in files:
+        with open(f) as file:
+            msgs = json.load(file)
+            for msg in msgs:
+                data.append([
+                    msg.get('nick',os.path.basename(f).split('.')[0]),
+                    msg.get('round', -1),
+                    msg['id_message'],
+                    msg['date'],
+                    msg['message']])
+    df = pd.DataFrame(data, columns=['subject_id', 'round', 'id_message', 'date', 'message'])
+    if truth is not None:
+        df = df.merge(truth, on='subject_id')
+    return df
+def get_train(hf_token:str):
+    from datasets import load_dataset, Dataset
+    ds = load_dataset('nlpUc3mStudents/mental-risk-d')
+    train_df = ds['train'].to_pandas()
+    return train_df
+def get_test(hf_token:str):
+    raise NotImplementedError("Test data is not available")
+task_d_cols = ['suffer+in favour', 'suffer+against', 'suffer+other', 'control']
+def make_task_labels_from_d(d_data:np.ndarray, include_d:bool=False) -> pd.DataFrame:
+    """
+    Get the labels of all other tasks from the labels of the d task
+    """
+    if isinstance(d_data, pd.DataFrame):
+        d_df = d_data.astype(float)
+    else:
+        d_df = pd.DataFrame(d_data, columns=task_d_cols).astype(float)
+    df = d_df.assign(
+        c_label = lambda df: df.iloc[:,:-1].apply(
+            lambda x: df.columns[np.argmax(x)] if sum(x)>=0.5 else 'control', axis=1
+        ),
+        a_label=lambda df: (df.c_label!='control').astype(int),
+        b_label = lambda df: df[task_d_cols[:-1]].sum(axis=1).round(2)
+    )
+    if not include_d:
+        df = df[['a_label', 'b_label', 'c_label']]
+    return df

src/embeddings.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from typing import List, Tuple, Dict, Any, Union
+import numpy as np
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.multioutput import MultiOutputRegressor
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
+import sklearn
+from sklearn.pipeline import Pipeline
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from copy import deepcopy
+from . import utils
+class EmbeddingsRegressor(BaseEstimator, RegressorMixin):
+    def __init__(
+            self,
+            encoder: Union[SentenceTransformer, AutoTokenizer],
+            regressor: Union[MultiOutputRegressor, BaseEstimator],
+            normalize_output: bool = True,
+            verbose: bool = False
+        ):
+        self.encoder = encoder
+        self.regressor = regressor
+        self.normalize_output = normalize_output
+        self.encodings = None
+        self.verbose = verbose
+    def fit(self, X: List[str], y: List[Tuple[float, float, float, float]]) -> "EmbeddingsRegressor":
+        X = self.encoder.encode(X, show_progress_bar=self.verbose)
+        self.regressor.fit(X, y)
+        return self
+    def transform(self, X: List[str]) -> List[List[float]]:
+        X = self.encoder.encode(X, show_progress_bar=self.verbose)
+        self.encodings = X
+        return X
+    def predict(self, X: Union[List[str], np.array], encodings=False) -> Union[List[float],List[List[float]]]:
+        if not encodings:
+            X = self.encoder.encode(X, show_progress_bar=self.verbose)
+        self.encodings = X
+        pred = self.regressor.predict(X)
+        if self.normalize_output:
+            pred /= pred.sum(axis=1, keepdims=True)
+        return pred

src/eval.py ADDED Viewed

	@@ -0,0 +1,195 @@

+from typing import Dict, List, Tuple, Any, Callable
+from dataclasses import dataclass
+import pandas as pd
+import numpy as np
+from sklearn.metrics import (
+    f1_score, accuracy_score, recall_score, confusion_matrix,
+    classification_report,
+    r2_score, mean_squared_error
+)
+@dataclass
+class ClassificationScores:
+    precision: float
+    recall: float
+    f1: float
+    support: float = None
+    @classmethod
+    def from_dict(cls, d:Dict[str, float]) -> "ClassificationScores":
+        d = {k.split('-')[0]: v for k, v in d.items() if k.split('-')[0] in cls.__annotations__}
+        return cls(**d)
+@dataclass
+class RegressionScores:
+    r2: float
+    mse: float
+    rmse: float
+    @classmethod
+    def make(cls, true:np.ndarray, pred:np.ndarray) -> "RegressionScores":
+        return cls(
+            r2=r2_score(true, pred),
+            mse=mean_squared_error(true, pred),
+            rmse=mean_squared_error(true, pred, squared=False)
+        )
+    def __add__(self, other):
+        return RegressionScores(
+            r2=self.r2 + other.r2,
+            mse=self.mse + other.mse,
+            rmse=self.rmse + other.rmse
+        )
+    def __truediv__(self, other):
+        return RegressionScores(
+            r2=self.r2 / other,
+            mse=self.mse / other,
+            rmse=self.rmse / other
+        )
+@dataclass
+class ClassificationReport:
+    accuracy: float
+    confusion: np.ndarray
+    macro: ClassificationScores
+    weighted: ClassificationScores
+    labels: list
+    label_scores: Dict[str, ClassificationScores] # label -> ClassificationScores
+    f1: float = None # only for binary classification
+    recall: float = None # only for binary classification
+    @classmethod
+    def make_report(cls, true:np.ndarray, pred:np.ndarray) -> "ClassificationReport":
+        class_labels = np.unique(np.concatenate([true, pred]))
+        report = classification_report(true, pred, labels=class_labels, output_dict=True, zero_division=0)
+        rep = cls(
+            accuracy=report.pop('accuracy'),
+            confusion=confusion_matrix(true, pred, labels=class_labels),
+            macro=ClassificationScores.from_dict(report.pop('macro avg')),
+            weighted=ClassificationScores.from_dict(report.pop('weighted avg')),
+            label_scores={label: ClassificationScores.from_dict(scores) for label, scores in report.items()},
+            labels=list(class_labels)
+        )
+        if len(class_labels) == 2:
+            rep.f1 = f1_score(true, pred)
+            rep.recall = recall_score(true, pred)
+        return rep
+    @property
+    def df(self):
+        df_dict = {
+            'Accuracy': self.accuracy,
+            **{f'{score.title()} (macro)': getattr(self.macro, score) for score in self.macro.__annotations__ if score != 'support'},
+        }
+        df = pd.DataFrame([df_dict])
+        return df
+@dataclass
+class RegressionReport:
+    r2: float
+    rmse: float
+    labels: list = None # only for multivariate regression
+    label_scores: Dict[str, float] = None # only for multivariate regression
+    @classmethod
+    def make_report(cls, true:np.ndarray, pred:np.ndarray, labels=None) -> "RegressionReport":
+        report = cls(
+            r2=r2_score(true, pred),
+            rmse=mean_squared_error(true, pred, squared=False)
+        )
+        if len(true.shape) > 1 and true.shape[1] > 1:
+            report.labels = labels or list(range(true.shape[1]))
+            report.label_scores = {label: RegressionScores.make(true[:,i], pred[:,i]) for i,label in enumerate(report.labels)}
+        return report
+    @property
+    def is_multivariate(self):
+        return self.labels is not None
+    @property
+    def df(self):
+        df_dict = {
+            'R2 avg': self.r2,
+            'RMSE avg': self.rmse,
+        }
+        if self.is_multivariate:
+            df_dict.update({f'R2 {label}': scores.r2 for label, scores in self.label_scores.items()})
+            df_dict.update({f'RMSE {label}': scores.rmse for label, scores in self.label_scores.items()})
+        df = pd.DataFrame([df_dict])
+        rmse_cols = ['RMSE avg']
+        df = df.filter(items=['RMSE avg', 'Pearson avg'] + sorted(df.columns.difference(['Pearson avg', 'RMSE avg'])))
+        df.columns = df.columns.str.replace('\s(a|b|c|d)_', ' ', regex=True)
+        return df
+@dataclass
+class Results:
+    """
+    Container with one report per task, as built by `absolute_results`.
+    A field may hold None when `absolute_results` finds no columns for that task.
+    """
+    # classification report of task a
+    taska: ClassificationReport
+    # regression report of task b
+    taskb: RegressionReport
+    # classification report of task c
+    taskc: ClassificationReport
+    # regression report of task d
+    taskd: RegressionReport
+def absolute_results(true_df:pd.DataFrame, pred_df:pd.DataFrame, tasks='abcd'):
+    task_reports = {}
+    for task in tasks:
+        true=true_df.filter(regex=f'^{task}_').sort_index(axis=1)
+        pred=pred_df.filter(regex=f'^{task}_').sort_index(axis=1)
+        if len(true.columns) == 0 or len(pred.columns) == 0:
+            task_reports['task'+task] = None
+            continue
+        if task in ['a', 'c']:
+            task_reports['task'+task] = ClassificationReport.make_report(
+                true=true.iloc[:,0].values,
+                pred=pred.iloc[:,0].values
+            )
+        else:
+            task_reports['task'+task] = RegressionReport.make_report(
+                true=true.values,
+                pred=pred.values,
+                labels=true.columns.tolist() if task == 'd' else None
+            )
+    return Results(**task_reports)
+def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
+    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
+        estimator_scores = {}
+        for name, estimator in estimators:
+            estimator.fit(X_train, y_train)
+            y_pred = estimator.predict(X_test)
+            score = score_func(y_test, y_pred)#*(1.4*((y_train>th).sum()/(len(y_train)-1))) # weighted for class imbalance
+            print(f"\"{name}\" estimator score: {score:.4f}")
+            estimator_scores[name] = score
+        return estimator_scores
+    return fit_eval_estimators
+def label_metrics(score_fun, y_true, y_pred):
+    if len(y_true.shape) > 1 and y_true.shape[1] > 1:
+        scores = []
+        for i in range(y_true.shape[1]):
+            scores.append(score_fun(y_true[:,i],y_pred[:,i]))
+        return scores
+    score = score_fun(y_true.ravel(), y_pred.ravel())
+    if isinstance(score, list):
+        return score
+    elif isinstance(score, np.ndarray):
+        return score.tolist()
+    else:
+        return [score]
+def metrics_for_estimators(estimators, score_fun, X, y_true):
+    metrics = {}
+    for name, estimator in estimators:
+        y_pred = estimator.predict(X)
+        metrics[name] = label_metrics(score_fun, y_true, y_pred)
+    return metrics

src/models.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from .roberta_regressor import RobertaRegressor
+from .embeddings import EmbeddingsRegressor
+class EmbeddingsSimpleRegressor

src/multiregression.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from typing import List, Tuple, Dict, Any, Union
+from copy import deepcopy
+import numpy as np
+import sklearn
+from sklearn.base import BaseEstimator, RegressorMixin
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.pipeline import Pipeline
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from . import utils
+class RegChainWithPCA(BaseEstimator, RegressorMixin):
+    def __init__(
+            self,
+            base_regressor:sklearn.base.BaseEstimator,
+            num_components:float=0.97,
+            pca_exclude_first:bool=True,
+            **fit_params):
+        """
+        This chain works like sklearn.multioutput.RegressorChain,
+        but applies PCA to reduce the dimensionality of the input data of the chain.
+        By default, the first target is excluded from the PCA transformation.
+        That is, it is fitted with the original input data while the rest of the targets
+        are fitted with the PCA-transformed input data.
+        Parameters
+        ----------
+        base_regressor : sklearn.base.BaseEstimator
+            The base regressor to be used in the chain.
+        num_components : float, optional
+            The number of components to keep in the PCA transformation.
+            If float, it is the ratio of variance to be kept.
+            If int, it is the number of components to keep.
+            The default is 0.97.
+        pca_exclude_first : bool, optional
+            If True the first target is excluded from the PCA transformation.
+            If False all targets including the first are fitted with the PCA-transformed input data.
+            The default is True.
+        **fit_params :
+            Additional parameters to be passed to the fit method of the base regressor.
+        """
+        self.base_regressor = base_regressor
+        self.num_components = num_components
+        self.pca_exclude_first = pca_exclude_first
+        self.estimators = None
+        self.pipes = None
+        self.fit_params = fit_params
+    def fit_pipe(self, X, num_components=None):
+        if num_components is None:
+            num_components = self.num_components
+        pipe = Pipeline([
+            ('scaler', StandardScaler()),
+            ('pca', PCA(n_components=self.num_components)),
+        ])
+        pipe.fit(X)
+        self.pipe = pipe
+        return pipe
+    def fit(self, X, y, **fit_params):
+        fit_params_ = self.fit_params.copy()
+        fit_params_.update(fit_params)
+        pipe = self.fit_pipe(X)
+        Y_pred_chain = np.zeros((X.shape[0], y.shape[1]))
+        X_transformed = pipe.transform(X)
+        num_components_pca = X_transformed.shape[1]
+        X_aug = np.hstack((X_transformed, Y_pred_chain))
+        self.estimators = [deepcopy(self.base_regressor) for _ in range(y.shape[1])]
+        del Y_pred_chain, X_transformed
+        for idx, estimator in enumerate(self.estimators):
+            if idx == 0 and self.pca_exclude_first:
+                estimator.fit(X, y[:, idx], **fit_params_)
+            else:
+                estimator.fit(X_aug[:, : (num_components_pca + idx)], y[:, idx], **fit_params_)
+            if idx < y.shape[1] - 1:
+                if idx == 0 and self.pca_exclude_first:
+                    X_aug[:, num_components_pca + idx] = estimator.predict(X)
+                else:
+                    X_aug[:, num_components_pca + idx] = estimator.predict(X_aug[:, : (num_components_pca + idx)])
+    def predict(self, X):
+        Y_pred_chain = np.zeros((X.shape[0], len(self.estimators)))
+        X_transformed = self.pipe.transform(X)
+        X_aug = np.hstack((X_transformed, Y_pred_chain))
+        for idx, estimator in enumerate(self.estimators):
+            if idx == 0 and self.pca_exclude_first:
+                Y_pred_chain[:, idx] = estimator.predict(X)
+            else:
+                Y_pred_chain[:, idx] = estimator.predict(X_aug[:, : (X_transformed.shape[1] + idx)])
+            if idx < len(self.estimators) - 1:
+                X_aug[:, X_transformed.shape[1] + idx] = Y_pred_chain[:, idx]
+        return Y_pred_chain
+    def score(self, X, y):
+        return utils.comp_score(y, self.predict(X))
+    def get_params(self, deep=True):
+        return {
+            'base_regressor': self.base_regressor,
+            'num_components': self.num_components,
+            'pca_exclude_first': self.pca_exclude_first,
+            **self.fit_params
+        }

src/roberta_regressor.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+Defines a wrapper class of RobertaPreTrainedModel model to do regression on text data.
+Based on: https://www.kaggle.com/code/sumantindurkhya/bert-for-regression
+"""
+from typing import Optional, Tuple, Union
+from tqdm import tqdm, trange
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import BertModel, BertPreTrainedModel, RobertaPreTrainedModel, RobertaModel
+class RobertaRegressor(RobertaPreTrainedModel):
+    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
+        super().__init__(config)
+        self.num_outputs = num_outputs
+        self.roberta = RobertaModel(config)
+        if freeze_bert:
+            # freeze the roberta parameters
+            for param in self.roberta.parameters():
+                param.requires_grad = False
+        self.classifier = nn.Linear(config.hidden_size, 128)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(dropout)
+        self.tanh = nn.Tanh()
+        self.regressor = nn.Linear(128, num_outputs)
+    def forward(self, input_ids, attention_mask):
+        # forward pass of the model
+        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
+        logits = base_out.pooler_output
+        out = self.classifier(logits)
+        out = self.dropout(out)
+        out = self.relu(out)
+        out = self.tanh(out)
+        out = self.dropout(out)
+        out = self.regressor(out)
+        return out
+    def predict(self, text:str, tokenizer, device, numpy=True) -> Tuple[float, float, float, float]:
+        input_ids, attention_mask = tokenizer.encode_plus(text, padding=True, truncation=True, return_tensors='pt').values()
+        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
+        output = self(input_ids, attention_mask).squeeze()
+        # free up memory
+        del input_ids, attention_mask
+        out = output.detach()
+        if numpy:
+            return out.cpu().numpy()
+        return out
+class RobertaSeqMultiRegressor(RobertaPreTrainedModel):
+    """
+    A wrapper class of RobertaPreTrainedModel model to do multi-output regression on text data.
+    This models the task of predicting multiple outputs from a single text input.
+    The problem is formulated in a sequential manner, where the model predicts the next output
+    conditioned on the previous outputs.
+    This approach is ideal for modeling problems where the outputs are correlated
+    such as probability distributions, where the sum of the outputs must be 1.
+    Or, for example, in the case of predicting the next word in a sentence, where the
+    model must predict the next word conditioned on the previous words.
+    The model is similar to the one described in the RobertaRegressor class, with the
+    exception that the head of the model is a sequential model, where the output of the
+    previous layer is fed as input to the next layer similar to how a RNN works.
+    """
+    def __init__(self, config, num_outputs=1, dropout=0.1, freeze_bert=False):
+        super().__init__(config)
+        self.num_outputs = num_outputs
+        self.roberta = RobertaModel(config)
+        if freeze_bert:
+            # freeze the roberta parameters
+            for param in self.roberta.parameters():
+                param.requires_grad = False
+        # head of the model is a model that takes the output of the previous layer as input
+        # and outputs a single value until the number of outputs is reached
+        for i in range(num_outputs):
+            setattr(self, f"regressor_{i}", nn.Linear(config.hidden_size, 128))
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(dropout)
+        self.tanh = nn.Tanh()
+    def forward(self, input_ids, attention_mask):
+        # forward pass of the model
+        base_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
+        logits = base_out.pooler_output
+        outputs = []
+        for i in range(self.num_outputs):
+            out = getattr(self, f"regressor_{i}")(logits)
+            out = self.dropout(out)
+            out = self.relu(out)
+            out = self.tanh(out)
+            outputs.append(out)
+        return outputs
+def sum_diff_loss(output, target):
+    return torch.sum(torch.abs(output - target))
+def evaluate(model, criterion, dataloader, device, sum_diff_penalty=False):
+    model.eval()
+    mean_acc, mean_loss, count = 0, 0, 0
+    with torch.no_grad():
+        for input_ids, attention_mask, target in (dataloader):
+            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
+            output = model(input_ids, attention_mask)
+            mean_loss += criterion(output.squeeze(), target.type_as(output)).item()
+            count += 1
+    return mean_loss/count
+# def predict(model, dataloader, device):
+#     predicted_label = []
+#     actual_label = []
+#     with torch.no_grad():
+#         for input_ids, attention_mask, target in (dataloader):
+#             input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
+#             output = model(input_ids, attention_mask)
+#             predicted_label += output
+#             actual_label += target
+#     return predicted_label
+def train(model, criterion, optimizer, train_loader, val_loader, epochs, device):
+    best_acc = 0
+    for epoch in trange(epochs, desc="Epoch"):
+        model.train()
+        train_loss = 0
+        for i, (input_ids, attention_mask, target) in enumerate(iterable=train_loader):
+            optimizer.zero_grad()
+            input_ids, attention_mask, target = input_ids.to(device), attention_mask.to(device), target.to(device)
+            output = model(input_ids=input_ids, attention_mask=attention_mask)
+            # out = model.classifier(output)
+            loss = criterion(output.squeeze(), target.type_as(output))
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+        print(f"Training loss is {train_loss/len(train_loader)}")
+        val_loss = evaluate(model=model, criterion=criterion, dataloader=val_loader, device=device)
+        print("Epoch {} complete! Validation Loss : {}".format(epoch, val_loss))
+def multi_reg_loss(loss='mse', sum_diff_penalty:float=0.0):
+    """
+    A custom loss function that penalizes the sum of differences
+    between the predicted and actual values for multi-output regression.
+    This is done to guide the model to predict outputs where
+    sum(y_hat1, y_hat2, ...) = sum(y1, y2, ...)
+    e.g: in task d, we have that sum(label1, label2, label3, label4) = 1
+    since its a probability distribution.
+    Parameters
+    ----------
+    loss : str, optional
+        The loss function to be used, by default 'mse'
+        Available options: 'mse' and 'cross_entropy'
+        for mean squared error and cross entropy loss respectively
+    sum_diff_penalty : float, optional
+        The penalty to be applied to the sum of differences between the predicted and actual values, by default 0.0 (no penalty)
+    """
+    if loss == 'mse':
+        loss_func = F.mse_loss
+    elif loss == 'cross_entropy':
+        loss_func = F.cross_entropy
+    else:
+        raise ValueError("Invalid loss function. Available options: 'mse' and 'cross_entropy'")
+    def reg_loss(input, target):
+        # first compute the normal MSE loss
+        mse = loss_func(input, target)
+        # then penalize the sum of differences between the predicted and actual values
+        sum_diff = torch.square(torch.sum(input) - torch.sum(target))
+        return mse + sum_diff_penalty*sum_diff
+    return reg_loss

src/train.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import numpy as np
+import pandas as pd
+# Embeddings
+from sentence_transformers import SentenceTransformer
+# train a classifier on the embeddings for multiclass regression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import (
+    r2_score, mean_squared_error,  # regression metrics
+    accuracy_score, f1_score, precision_score, recall_score # classification metrics
+)
+from sklearn.multioutput import MultiOutputRegressor, RegressorChain # for multiclass regression
+# Estimators
+from sklearn.ensemble import (
+    RandomForestRegressor,
+    RandomForestClassifier,
+    GradientBoostingRegressor,
+    GradientBoostingClassifier,
+    AdaBoostRegressor,
+    AdaBoostClassifier
+)
+from sklearn.linear_model import (
+    LinearRegression,
+    LogisticRegression,
+    Ridge,
+    Lasso
+)
+# other regressors
+from sklearn.svm import SVR
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.neural_network import MLPRegressor
+from sklearn.tree import DecisionTreeRegressor
+from lightgbm import LGBMRegressor, LGBMClassifier
+# type hinting
+import os, json
+from typing import List, Callable, Dict, Tuple, Any
+# local imports
+from src import data, utils
+from src.embeddings import EmbeddingsRegressor
+def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
+    """
+    Metric for multiclass regression. Computes the average of the RMSE scores for each label.
+    """
+    rmse_scores = []
+    for i in range(y_true.shape[1]):
+        rmse_scores.append(np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i])))
+    return np.mean(rmse_scores)
+def estimators_eval(estimators:List[Tuple[str,Any]], score_func:Callable[[np.ndarray, np.ndarray], float]):
+    def fit_eval_estimators(X_train:np.ndarray, y_train:np.ndarray, X_test:np.ndarray, y_test:np.ndarray) -> dict:
+        estimator_scores = {}
+        for name, estimator in estimators:
+            estimator.fit(X_train, y_train)
+            y_pred = estimator.predict(X_test)
+            score = score_func(y_test, y_pred)#*(1.4*((y_train>th).sum()/(len(y_train)-1))) # weighted for class imbalance
+            print(f"\"{name}\" estimator score: {score:.4f}")
+            estimator_scores[name] = score
+        return estimator_scores
+    return fit_eval_estimators
+def get_data():
+    # load the train and test data
+    train_data = data.load('train')
+    test_df = data.load('test')
+    # concat messages by subject id
+    train_data = data.concat_messages(train_data)
+    test_df = data.concat_messages(test_df)
+    # split into 15% of subject ids for validation
+    # get the classes as the argmax of the label probabilities to use them for stratification
+    subj_classes = train_data.set_index('subject_id').filter(regex='^d_')\
+        .apply(lambda x: x.argmax() if x[:-1].sum()<0.5 else x[:-1].argmax(), axis=1)\
+            .replace(dict(enumerate(train_data.filter(regex='^d_').columns)))
+    tr_subj_ids, val_subj_ids = train_test_split(subj_classes.index, test_size=0.15, random_state=42, stratify=subj_classes.values)
+    # split the train data into train and validation sets
+    val_df = train_data[train_data['subject_id'].isin(val_subj_ids)]
+    train_df = train_data[train_data['subject_id'].isin(tr_subj_ids)]
+    # augment the train data by taking only the first half of the messages
+    half_messages_df_train = train_df.assign(
+        message=lambda df: df['message'].apply(lambda x: ' | '.join(x.split(' | ')[:len(x.split(' | '))//2])),
+        # num_messages=lambda df: df['message'].apply(lambda x: len(x.split(' | ')))
+    )
+    train_df = pd.concat([train_df, half_messages_df_train], axis=0).sort_values('subject_id').reset_index(drop=True)
+    return train_df, val_df, test_df

src/utils.py ADDED Viewed

	@@ -0,0 +1,62 @@

+from typing import List, Tuple
+import numpy as np
+import pandas as pd
+def print_messages(msgs:List[dict]):
+    """
+    Print the messages of a subject
+    Messages are a list of dictionaries of the form: [{'id_message': {int_id}, 'message': '{str_message}', 'date': '{str_date}'}, ...]
+    and are attached to an specific subject.
+    """
+    for message in msgs:
+        print(f"{message['date']} - {message['message']}")
+def load_data(files, truth):
+    """load all the data into a dataframe"""
+    import os, json
+    data = []
+    for f in files:
+        with open(f) as file:
+            msgs = json.load(file)
+            for msg in msgs:
+                data.append([os.path.basename(f).split('.')[0], msg['id_message'], msg['date'], msg['message']])
+    df = pd.DataFrame(data, columns=['subject_id', 'id_message', 'date', 'message'])
+    df = df.merge(truth, on='subject_id')
+    return df
+def normalize(x, prob=True):
+    """
+    Normalize a vector to [0,1] and sum 1 if prob=True
+    """
+    x = x.reshape(-1,4)
+    # normalize to [0,1]
+    x = ((x - x.min(axis=1)[...,None])/(x.max(axis=1)[...,None] - x.min(axis=1)[...,None])).round(4)
+    if prob:
+        # normalize to sum 1
+        x = x/x.sum(axis=1)[...,None]
+    return x.round(4)
+def label_metrics(score_fun, y_true, y_pred):
+    scores = []
+    for i in range(y_true.shape[1]):
+        scores.append(score_fun(y_true[:,i],y_pred[:,i]))
+    return scores
+def make_predict(predict_fn, **kwargs):
+    def predict(msg):
+        pred = predict_fn(msg, **kwargs)
+        return pred
+    return predict
+def comp_score(y_true:np.ndarray,y_pred:np.ndarray)->float:
+    """
+    Metric for simple and multiclass regression. Computes the average of the RMSE scores for each label.
+    """
+    from sklearn.metrics import mean_squared_error
+    rmse_scores = []
+    for i in range(y_true.shape[1]):
+        rmse_scores.append(np.sqrt(mean_squared_error(y_true[:,i],y_pred[:,i])))
+    return np.mean(rmse_scores)