# Mile-stone-3 / train.py
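# Fine-tunes three pretrained checkpoints (vinai/bertweet-base, roberta-base,
# distilbert-base-cased) as multi-label classifiers over the six Jigsaw toxicity
# labels, then saves each fine-tuned model to its own directory.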
import os

import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          RobertaTokenizer, Trainer, TrainingArguments)

# Release any GPU memory cached by a previous run before training starts
torch.cuda.empty_cache()
class MultiLabelClassifierDataset(Dataset):
    """Wraps tokenizer encodings and a binary label matrix for the Trainer.

    Labels are returned as floats so the sequence-classification head uses the
    multi-label (BCEWithLogitsLoss) objective rather than single-label cross entropy.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)
work_dir = os.path.dirname(os.path.realpath(__file__)) + '/'
dataset_dir = work_dir + 'jigsaw-toxic-comment-classification-challenge/'
classifiers = ['toxic', 'severe_toxic', 'obscene',
'threat', 'insult', 'identity_hate']
df = pd.read_csv(dataset_dir + 'train.csv')
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the rows
# Train on the first 10% of the shuffled data to keep runtimes manageable
train_df = df[:int(len(df) * 0.1)]
# Binary label matrix of shape (n_samples, 6), one column per toxicity class
train_labels = train_df[classifiers].to_numpy()
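# Optional sanity check (added sketch, not part of the original pipeline): the
# Jigsaw labels are heavily imbalanced, so log the positive rate per class.
print(train_df[classifiers].mean())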
device = torch.device('cuda')  # The Trainer handles device placement itself; logged here for reference
print("Using device: ", device)
# Training configuration for the BERTweet run. per_device_eval_batch_size is
# unused here because no eval_dataset is passed to the Trainer.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True  # Mixed-precision training; requires a CUDA-capable GPU
)
print("BERT")
bert_dir = work_dir + 'bert/'
print("Model base: ", "vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained(
"vinai/bertweet-base", model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
print("Training model to be stored in" + bert_dir)
print("Creating dataset")
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
print("Loading model for training...")
model = AutoModelForSequenceClassification.from_pretrained(
'vinai/bertweet-base', num_labels=6)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(bert_dir + '_bert_model')
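# Added sketch (not in the original script): saving the tokenizer alongside the
# model makes the output directory self-contained for later from_pretrained() calls.
tokenizer.save_pretrained(bert_dir + '_bert_model')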
# RoBERTa run: same settings, but a single epoch and a smaller eval batch size
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("RoBERTa")
roberta_dir = work_dir + 'roberta/'
tokenizer = RobertaTokenizer.from_pretrained(
'roberta-base', model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=6,
    problem_type="multi_label_classification")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(roberta_dir + '_roberta_model')
# DistilBERT run: a single epoch with the larger eval batch size
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=True
)
print("DISTILBERT")
distilbert_dir = work_dir + 'distilbert/'
tokenizer = AutoTokenizer.from_pretrained(
'distilbert-base-cased', model_max_length=128)
train_encodings = tokenizer(
train_df['comment_text'].tolist(), truncation=True, padding=True)
train_dataset = MultiLabelClassifierDataset(train_encodings, train_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-cased', num_labels=6,
    problem_type="multi_label_classification")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset
)
trainer.train()
trainer.save_model(distilbert_dir + '_distilbert_model')
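# --- Example usage (added sketch, not part of the original training run) ---
# Shows how one of the checkpoints saved above could be reloaded and applied to
# a new comment. The directory matches the save_model() call for the BERTweet
# run; sigmoid is applied per label because the heads were trained as
# independent binary (multi-label) classifiers.
example_dir = bert_dir + '_bert_model'
example_tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base", model_max_length=128)
example_model = AutoModelForSequenceClassification.from_pretrained(example_dir)
example_model.eval()
inputs = example_tokenizer("you are a wonderful person", return_tensors="pt",
                           truncation=True, padding=True)
with torch.no_grad():
    logits = example_model(**inputs).logits
probs = torch.sigmoid(logits)[0]
for label, prob in zip(classifiers, probs.tolist()):
    print(f"{label}: {prob:.3f}")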