🤗 HuggingFace Training Example
Ref: This notebook comes from HuggingFace Examples 🤗
# installation
# !pip install git+https://github.com/gradsflow/gradsflow@main -q -U
# !pip install -U transformers datasets -q
# ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# ! tar -xf aclImdb_v1.tar.gz
This data is organized into `pos` and `neg` folders with one text file per example. Let's write a function that can read this in.
from pathlib import Path
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)  # neg -> 0, pos -> 1
    return texts, labels
train_texts, train_labels = read_imdb_split("aclImdb/train")
test_texts, test_labels = read_imdb_split("aclImdb/test")
We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and tuning without tainting our test set results. scikit-learn has a convenient utility for creating such splits:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.3)
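A quick sanity check on the split sizes. The standard IMDb download ships 25,000 labeled training reviews, so `test_size=0.3` should leave 17,500 for training and 7,500 for validation:

# 25,000 reviews with a 70/30 split -> 17,500 train / 7,500 validation
print(len(train_texts), len(val_texts))  # 17500 7500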
Alright, we've read in our dataset. Now let's tackle tokenization. We'll eventually train a classifier using pre-trained DistilBert, so let's use the DistilBert tokenizer.
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
Now we can simply pass our texts to the tokenizer. We'll pass `truncation=True` and `padding=True`, which will ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's maximum input length. This will allow us to feed batches of sequences into the model at the same time.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
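The returned encodings behave like dictionaries. A quick illustrative peek at what the tokenizer produced:

# DistilBERT's tokenizer returns token ids plus an attention mask that marks
# real tokens vs. padding (DistilBERT has no token_type_ids).
print(train_encodings.keys())                 # dict_keys(['input_ids', 'attention_mask'])
print(len(train_encodings["input_ids"][0]))   # padded length, capped at the 512-token limit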
Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a `torch.utils.data.Dataset` object and implementing `__len__` and `__getitem__`. In TensorFlow, we pass our input encodings and labels to the `from_tensor_slices` constructor method. We put the data in this format so that the data can be easily batched such that each key in the batch encoding corresponds to a named parameter of the `DistilBertForSequenceClassification.forward` method of the model we will train.
import torch
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return one example as a dict of tensors whose keys match the
        # named parameters of the model's forward() method.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
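For reference, the TensorFlow route mentioned above would look roughly like this; it isn't used in the rest of this notebook, which sticks with PyTorch:

import tensorflow as tf

# from_tensor_slices takes the dict-of-lists encodings plus the label list
# and yields one (features, label) pair per example.
train_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))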
Now that our datasets are ready, we can fine-tune a model either with the 🤗 `Trainer`/`TFTrainer` or with native PyTorch/TensorFlow. See the training docs.
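For reference, the `Trainer` route would look roughly like this. It's a minimal sketch with illustrative argument values; this notebook fine-tunes with native PyTorch wrapped in GradsFlow instead.

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",            # where checkpoints are written
    num_train_epochs=1,                # illustrative; tune for your task
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

hf_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
    model=hf_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# trainer.train()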
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
from gradsflow import Model, AutoDataset
class GFModel(Model):
    def __init__(self, learner):
        # Enable mixed-precision training via the accelerator config.
        super().__init__(learner, accelerator_config={"fp16": True})

    def compile(self, metrics):
        optimizer = AdamW(self.learner.parameters(), lr=5e-5)
        self.optimizer = self.prepare_optimizer(optimizer)
        super().compile(metrics=metrics)

    def train_step(self, batch):
        self.optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        # Passing labels makes the HF model return (loss, logits, ...).
        outputs = self.learner(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        self.backward(loss)
        self.optimizer.step()
        self.tracker.track("train/step_loss", loss, render=True)
        return {"loss": loss, "logits": outputs[1].cpu(), "target": labels.cpu()}

    def val_step(self, batch):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = self.learner(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        self.tracker.track("val/step_loss", loss, render=True)
        return {"loss": loss, "logits": outputs[1].cpu(), "target": labels.cpu()}
data = AutoDataset(train_dataloader=train_loader, val_dataloader=val_loader)
gf_model = GFModel(model)
gf_model.compile(metrics="accuracy")
gf_model.device
device(type='cuda')
gf_model.fit(data, show_progress=True)
Tracker(max_epochs=0, current_epoch=0, current_step=1094, steps_per_epoch=None, train=TrackingValues(loss=0.2676990067182042, steps=1094, step_loss=0.28964027762413025, metrics={'Accuracy': tensor(0.8925)}), val=TrackingValues(loss=0.20485645957120358, steps=469, step_loss=tensor(0.1323, device='cuda:0'), metrics={'Accuracy': tensor(0.9188)}))
gf_model.tracker.train
TrackingValues(loss=0.2676990067182042, steps=1094, step_loss=0.28964027762413025, metrics={'Accuracy': tensor(0.8925)})
gf_model.tracker.val
TrackingValues(loss=0.20485645957120358, steps=469, step_loss=tensor(0.1323, device='cuda:0'), metrics={'Accuracy': tensor(0.9188)})
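The held-out test set hasn't been touched yet. A minimal evaluation sketch in plain PyTorch, assuming the fine-tuned weights are reflected in `model` (GradsFlow trains the same underlying module); the loop below is illustrative and not part of the original notebook:

test_loader = DataLoader(test_dataset, batch_size=16)
model.to(device)
model.eval()

correct = total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Without labels, the first element of the output is the logits.
        logits = model(input_ids, attention_mask=attention_mask)[0]
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)

print(f"test accuracy: {correct / total:.4f}")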