🤗 HuggingFace Training Example
Ref: This notebook comes from HuggingFace Examples 🤗
# installation
# !pip install git+https://github.com/gradsflow/gradsflow@main -q -U
# !pip install -U transformers datasets -q
# ! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# ! tar -xf aclImdb_v1.tar.gz
This data is organized into `pos` and `neg` folders with one text file per example. Let's write a function that can read this in.
from pathlib import Path
def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)  # neg -> 0, pos -> 1
    return texts, labels
train_texts, train_labels = read_imdb_split("aclImdb/train")
test_texts, test_labels = read_imdb_split("aclImdb/test")
We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and tuning without tainting our test set results. scikit-learn has a convenient utility for creating such splits:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.3)
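A quick sanity check on the split sizes. The standard IMDb download ships 25,000 labeled training reviews, so `test_size=0.3` should leave 17,500 for training and 7,500 for validation:

# 25,000 reviews with a 70/30 split -> 17,500 train / 7,500 validation
print(len(train_texts), len(val_texts))  # 17500 7500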
Alright, we've read in our dataset. Now let's tackle tokenization. We'll eventually train a classifier using pre-trained DistilBert, so let's use the DistilBert tokenizer.
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
Now we can simply pass our texts to the tokenizer. We'll pass `truncation=True` and `padding=True`, which will ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's maximum input length. This will allow us to feed batches of sequences into the model at the same time.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
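The returned encodings behave like dictionaries. A quick illustrative peek at what the tokenizer produced:

# DistilBERT's tokenizer returns token ids plus an attention mask that marks
# real tokens vs. padding (DistilBERT has no token_type_ids).
print(train_encodings.keys())                 # dict_keys(['input_ids', 'attention_mask'])
print(len(train_encodings["input_ids"][0]))   # padded length, capped at the 512-token limit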
Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a `torch.utils.data.Dataset` object and implementing `__len__` and `__getitem__`. In TensorFlow, we pass our input encodings and labels to the `from_tensor_slices` constructor method. We put the data in this format so that the data can be easily batched such that each key in the batch encoding corresponds to a named parameter of the `DistilBertForSequenceClassification.forward` method of the model we will train.
import torch
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Return one example as a dict of tensors whose keys match the
        # named parameters of the model's forward() method.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
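For reference, the TensorFlow route mentioned above would look roughly like this; it isn't used in the rest of this notebook, which sticks with PyTorch:

import tensorflow as tf

# from_tensor_slices takes the dict-of-lists encodings plus the label list
# and yields one (features, label) pair per example.
train_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_tf_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))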
Now that our datasets are ready, we can fine-tune a model either with the 🤗 `Trainer`/`TFTrainer` or with native PyTorch/TensorFlow. See the training docs.
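For reference, the `Trainer` route would look roughly like this. It's a minimal sketch with illustrative argument values; this notebook fine-tunes with native PyTorch wrapped in GradsFlow instead.

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",            # where checkpoints are written
    num_train_epochs=1,                # illustrative; tune for your task
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

hf_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
    model=hf_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
# trainer.train()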
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
from gradsflow import Model, AutoDataset
class GFModel(Model):
    def __init__(self, learner):
        # Enable mixed-precision training via the accelerator config.
        super().__init__(learner, accelerator_config={"fp16": True})

    def compile(self, metrics):
        optimizer = AdamW(self.learner.parameters(), lr=5e-5)
        self.optimizer = self.prepare_optimizer(optimizer)
        super().compile(metrics=metrics)

    def train_step(self, batch):
        self.optimizer.zero_grad()
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        # Passing labels makes the HF model return (loss, logits, ...).
        outputs = self.learner(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        self.backward(loss)
        self.optimizer.step()
        self.tracker.track("train/step_loss", loss, render=True)
        return {"loss": loss, "logits": outputs[1].cpu(), "target": labels.cpu()}

    def val_step(self, batch):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        outputs = self.learner(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        self.tracker.track("val/step_loss", loss, render=True)
        return {"loss": loss, "logits": outputs[1].cpu(), "target": labels.cpu()}
data = AutoDataset(train_dataloader=train_loader, val_dataloader=val_loader)
gf_model = GFModel(model)
gf_model.compile(metrics="accuracy")
gf_model.device
device(type='cuda')
gf_model.fit(data, show_progress=True)
Tracker(max_epochs=0, current_epoch=0, current_step=1094, steps_per_epoch=None, train=TrackingValues(loss=0.2676990067182042, steps=1094, step_loss=0.28964027762413025, metrics={'Accuracy': tensor(0.8925)}), val=TrackingValues(loss=0.20485645957120358, steps=469, step_loss=tensor(0.1323, device='cuda:0'), metrics={'Accuracy': tensor(0.9188)}))
gf_model.tracker.train
TrackingValues(loss=0.2676990067182042, steps=1094, step_loss=0.28964027762413025, metrics={'Accuracy': tensor(0.8925)})
gf_model.tracker.val
TrackingValues(loss=0.20485645957120358, steps=469, step_loss=tensor(0.1323, device='cuda:0'), metrics={'Accuracy': tensor(0.9188)})
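The held-out test set hasn't been touched yet. A minimal evaluation sketch in plain PyTorch, assuming the fine-tuned weights are reflected in `model` (GradsFlow trains the same underlying module); the loop below is illustrative and not part of the original notebook:

test_loader = DataLoader(test_dataset, batch_size=16)
model.to(device)
model.eval()

correct = total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # Without labels, the first element of the output is the logits.
        logits = model(input_ids, attention_mask=attention_mask)[0]
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)

print(f"test accuracy: {correct / total:.4f}")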