import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    Trainer,
    TrainingArguments,
)

torch.cuda.empty_cache()


class MultiLabelClassifierDataset(Dataset):
    """Wraps tokenizer encodings and a (num_samples, num_labels) label matrix."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Float labels mark this as a multi-label problem (BCE-with-logits loss).
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)


work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'
classifiers = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load the Jigsaw training data, shuffle it, and keep 10% of it for training.
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle
train_df = df[:int(len(df) * 0.1)]
train_labels = train_df[classifiers].to_numpy()

# Trainer handles device placement itself; this is only used for the log line.
device = torch.device('cuda')
print("Using device: ", device)

# ---------------------------------------------------------------------------
# BERTweet
# ---------------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
)

print("BERT")
bert_dir = work_dir + 'bert/'
print("Model base: ", "vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", model_max_length=128)
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)

print("Training model to be stored in " + bert_dir)
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)

print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
    'vinai/bertweet-base',
    num_labels=6,
    problem_type="multi_label_classification",  # sigmoid + BCE loss, one output per label
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
trainer.save_model(bert_dir + '_bert_model')

# ---------------------------------------------------------------------------
# RoBERTa
# ---------------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
)

print("RoBERTa")
roberta_dir = work_dir + 'roberta/'
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', model_max_length=128)
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=6,
    problem_type="multi_label_classification",
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
trainer.save_model(roberta_dir + '_roberta_model')

# ---------------------------------------------------------------------------
# DistilBERT
# ---------------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True,
)

print("DISTILBERT")
distilbert_dir = work_dir + 'distilbert/'
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased', model_max_length=128)
train_encodings = tokenizer(train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=6,
    problem_type="multi_label_classification",
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
trainer.save_model(distilbert_dir + '_distilbert_model')
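
# ---------------------------------------------------------------------------
# Illustrative sanity check (not part of the original training run): a minimal
# sketch of reloading the last saved checkpoint and scoring one comment with
# the multi-label head. The example text and the 0.5 decision threshold are
# assumptions chosen for demonstration only.
# ---------------------------------------------------------------------------
check_model = AutoModelForSequenceClassification.from_pretrained(distilbert_dir + '_distilbert_model')
check_model.eval()
sample = tokenizer(["example comment to score"], truncation=True, padding=True, return_tensors='pt')
with torch.no_grad():
    probs = torch.sigmoid(check_model(**sample).logits)[0]  # independent per-label probabilities
print({name: round(float(p), 3) for name, p in zip(classifiers, probs)})
print("Predicted labels:", [name for name, p in zip(classifiers, probs) if p > 0.5])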