Overfitting when fine-tuning BERT sentiment analysis

Question

I am newbie to Machine Learning in general. I am currently trying to follow a tutorial on sentiment analysis using BERT and Transformers https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

However when I train the model it has appeared that the model is overfitting

I do not know how to fix this. I have tried lowering amount of epochs, increasing batch size , shuffling my data (which is ordered) and increasing the validation split. So far nothing has worked. I have even tried changing different learning rate but the one I am using now is the smallest.

Below is my code:

PRE_TRAINED_MODEL_NAME = 'TurkuNLP/bert-base-finnish-cased-v1'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

MAX_LEN = 40

#Make a PyTorch dataset
class FIDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len):

    self.texts = texts

    self.targets = targets

    self.tokenizer = tokenizer

    self.max_len = max_len

  def __len__(self):

    return len(self.texts)

  def __getitem__(self, item):

    text = str(self.texts[item])

    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(

      text,

      add_special_tokens=True,

      max_length=self.max_len,

      return_token_type_ids=False,

      pad_to_max_length=True,

      return_attention_mask=True,

      return_tensors='pt',

    )

    return {

      'text': text,

      'input_ids': encoding['input_ids'].flatten(),

      'attention_mask': encoding['attention_mask'].flatten(),

      'targets': torch.tensor(target, dtype=torch.long)

    }

#split test and train
df_train, df_test = train_test_split(

  df,

  test_size=0.1,

  random_state=RANDOM_SEED

)

df_val, df_test = train_test_split(

  df_test,

  test_size=0.5,

  random_state=RANDOM_SEED

)


#data loader function
def create_data_loader(df, tokenizer, max_len, batch_size):

  ds = FIDataset(

    texts=df.content.to_numpy(),

    targets=df.sentiment.to_numpy(),

    tokenizer=tokenizer,

    max_len=max_len

  )

  return DataLoader(

    ds,

    batch_size=batch_size,

    num_workers=4

  )

BATCH_SIZE = 32

#Load data into train, test, val
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)

val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

#Bert model loading
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):

    super(SentimentClassifier, self).__init__()

    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

    self.drop = nn.Dropout(p=0.1)

    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    returned = self.bert(
        
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    pooled_output = returned["pooler_output"]
    output = self.drop(pooled_output)
    
    return self.out(output)

#Create a Classifier instance and move to GPU
model = SentimentClassifier(3)

model = model.to(device)

#Optimize with AdamW
EPOCHS = 6

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(

  optimizer,

  num_warmup_steps=0,

  num_training_steps=total_steps

)

loss_fn = nn.CrossEntropyLoss().to(device)

#Train each Epoch function
def train_epoch(

  model,

  data_loader,

  loss_fn,

  optimizer,

  device,

  scheduler,

  n_examples

):

  model = model.train()

  losses = []

  correct_predictions = 0

  for d in data_loader:

    input_ids = d["input_ids"].to(device)

    attention_mask = d["attention_mask"].to(device)

    targets = d["targets"].to(device)

    outputs = model(

      input_ids=input_ids,

      attention_mask=attention_mask

    )

    _, preds = torch.max(outputs, dim=1)

    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)

    losses.append(loss.item())

    loss.backward()

    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()

    scheduler.step()

    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

import torch

history = defaultdict(list)

best_accuracy = 0

if __name__ == '__main__':    
    for epoch in range(EPOCHS):

      print(f'Epoch {epoch + 1}/{EPOCHS}')

      print('-' * 10)

      train_acc, train_loss = train_epoch(

        model,

        train_data_loader,

        loss_fn,

        optimizer,

        device,

        scheduler,

        len(df_train)

      )

      print(f'Train loss {train_loss} accuracy {train_acc}')

      val_acc, val_loss = eval_model(

        model,

        val_data_loader,

        loss_fn,

        device,

        len(df_val)

      )

      print(f'Val   loss {val_loss} accuracy {val_acc}')

      print()

      history['train_acc'].append(train_acc)

      history['train_loss'].append(train_loss)

      history['val_acc'].append(val_acc)

      history['val_loss'].append(val_loss)

      if val_acc > best_accuracy:

        torch.save(model.state_dict(), 'best_model_state.bin')

        best_accuracy = val_acc

score 7 · Answer 1 · answered Jun 27 '21 at 04:24

7

Broadly speaking, to reduce overfitting, you can:

increase regularization
reduce model complexity
perform early stopping
increase training data

From what you've written, you've already tried 3 and 4. In the case of neural networks, you can increase regularization by increasing dropout. You already have the code for it.

# NOTE: You don't need bert_model here since you're creating one inside
# of SentimentClassifier.
#bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sentiment Classifier based on Bert model just loaded
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):

    super(SentimentClassifier, self).__init__()

    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

    self.drop = nn.Dropout(p=0.1) # <-- INCREASE THIS VALUE

    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

I'd recommend trying higher values of the Dropout probability, as I noted in your code above ("INCREASE THIS VALUE"). Keep track of the Dropout probability and the resulting observed overfitting. Try probability values of 0.1, 0.2, 0.3, 0.4, 0.5.

Usually, I've found that dropout over 0.5 doesn't do much good.

answered Jun 27 '21 at 04:24

stackoverflowuser2010

38,621
48
169
217

Hello, Thanks for your comment. I have tried to replace the old data with new data of customers review from our store and the situation is now that my training accuracy is <1% while the test accuracy hovers around 60%- Do you have some idea how to fix this or is it that our data is entirely not suitable for Bert? – hhp Jul 01 '21 at 14:32
Are you still trying to do sentiment analysis? It's unlikely that BERT is at fault. Did you double-check your new dataset? Is the data in `utf-8`? Are you able to parse the sentences and the ground-truth scores from that data? Can you see the data in your `Dataset` and `Dataloader`, and is what you expect? – stackoverflowuser2010 Jul 01 '21 at 16:07
Hello I cannot encode my texts as utf-8 because they are not English but Finnish. Currently the encoding is ISO-8859-1. I have printed out a part of the dataset df.head() and they look correct. I have also tried to have the labeled data the same exact amount (i.e negative = positive = neutral = 2247 samples. in total 6741 samples) – hhp Jul 02 '21 at 12:04
Ok. You should have said you had a Finnish dataset in the first place. BERT is pre-trained on largely English text. I googled "BERT Finnish" and found several hits for new BERT models that have been pre-trained on Finnish. You should do the same. – stackoverflowuser2010 Jul 02 '21 at 16:58
Hello, as you can see in my code I used a Finnish Bert model – hhp Jul 05 '21 at 05:50

Overfitting when fine-tuning BERT sentiment analysis

1 Answers1