In [11]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers datasets
!pip install peft
!pip install evaluate scikit-learn transformers[torch]
!pip install accelerate>=0.26.0

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-1.0.1-py3-none-any.whl.metadata (19 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading accelerate-1.0.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.9/330.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate, peft
Successfully installed accelerate-1.0.1 peft-0.13.2


# Prepare the dataset

In [42]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer used to encode the review texts.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True``.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Load the IMDB dataset and tokenize it in batches.
imdb = load_dataset("stanfordnlp/imdb")
tokenized_datasets = imdb.map(tokenize_function, batched=True)


In [43]:
# Shuffle the train split with a fixed seed and keep the first 5000 examples.
training_set = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))

# Shuffle the test split once, then take two non-overlapping 1000-example slices:
# indices 0-999 for validation and 1000-1999 for testing.
shuffled_set = tokenized_datasets["test"].shuffle(seed=42)
validation_set, test_set = (
    shuffled_set.select(range(start, start + 1000)) for start in (0, 1000)
)

## First method: using the transformers library and PyTorch


In [1]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

# The tokenizer must come from the same checkpoint as the model. The dataset above
# was tokenized with "bert-base-cased" and the model is "bert-base-cased", so loading
# the "bert-base-uncased" vocabulary here would rebind `tokenizer` to token ids that
# do not match the model's embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# List the names of all parameters of the BERT backbone (values are not needed).
[param_name for param_name, _ in model.bert.named_parameters()]

['embeddings.word_embeddings.weight',
 'embeddings.position_embeddings.weight',
 'embeddings.token_type_embeddings.weight',
 'embeddings.LayerNorm.weight',
 'embeddings.LayerNorm.bias',
 'encoder.layer.0.attention.self.query.weight',
 'encoder.layer.0.attention.self.query.bias',
 'encoder.layer.0.attention.self.key.weight',
 'encoder.layer.0.attention.self.key.bias',
 'encoder.layer.0.attention.self.value.weight',
 'encoder.layer.0.attention.self.value.bias',
 'encoder.layer.0.attention.output.dense.weight',
 'encoder.layer.0.attention.output.dense.bias',
 'encoder.layer.0.attention.output.LayerNorm.weight',
 'encoder.layer.0.attention.output.LayerNorm.bias',
 'encoder.layer.0.intermediate.dense.weight',
 'encoder.layer.0.intermediate.dense.bias',
 'encoder.layer.0.output.dense.weight',
 'encoder.layer.0.output.dense.bias',
 'encoder.layer.0.output.LayerNorm.weight',
 'encoder.layer.0.output.LayerNorm.bias',
 'encoder.layer.1.attention.self.query.weight',
 'encoder.layer.1.attention.se

In [45]:
# Print the module tree of the classifier.
model_description = f"What is the model \n\n {model}"
print(model_description)


What is the model 

 BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Lay

### Create the LoRA module

This module combines the original linear layer with the LoRA down-projection (`lora_a`) and up-projection (`lora_b`).

In [None]:
import torch 
import copy
from torch import nn

class LoRALinear(nn.Module):
  """Linear layer with an additive low-rank (LoRA) update.

  The output is ``linear(x) + lora_b(lora_a(x))`` where

  * ``linear`` is a regular ``nn.Linear(in_dim, out_dim)`` with a bias,
  * ``lora_a`` is the bias-free down-projection ``in_dim -> rank``,
  * ``lora_b`` is the bias-free up-projection ``rank -> out_dim``.

  ``lora_b`` is initialised to zero, so a freshly built module returns exactly
  the output of ``linear`` until the LoRA matrices are trained.

  Args:
    in_dim: Size of the last dimension of the input.
    out_dim: Size of the last dimension of the output.
    rank: Inner dimension of the low-rank update; must be positive.

  Raises:
    ValueError: If ``rank`` is not strictly positive.
  """

  def __init__(
    self, in_dim: int, out_dim: int, rank: int
  ):
    super().__init__()
    if rank <= 0:
      raise ValueError(f"rank must be a positive integer, got {rank}")
    self.linear = nn.Linear(in_dim, out_dim, bias=True)
    self.lora_a = nn.Linear(in_dim, rank, bias=False)
    self.lora_b = nn.Linear(rank, out_dim, bias=False)
    # Zero up-projection: the low-rank branch contributes nothing at initialisation.
    nn.init.zeros_(self.lora_b.weight)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return the linear output plus the low-rank correction."""
    return self.linear(x) + self.lora_b(self.lora_a(x))


### Create a function replacing a linear layer with a LoRA module
Given the original `Linear` module, return a `LoRALinear` module.
* The inner linear layer of the `LoRALinear` must be initialised with the pretrained weights

In [47]:


def linear_to_lora(linear, rank=8):
    """Build a ``LoRALinear`` whose inner linear layer copies ``linear``.

    Args:
        linear: Source linear module; its ``weight`` (shape ``(out, in)``) and
            optional ``bias`` are copied into the new module.
        rank: Rank passed to ``LoRALinear``. Defaults to 8, the value that was
            previously hard-coded.

    Returns:
        A ``LoRALinear`` on the same device and dtype as ``linear.weight``.
    """
    output_size, input_size = linear.weight.shape
    lora = LoRALinear(input_size, output_size, rank=rank)
    # Keep every sub-module (including the LoRA matrices) on the source layer's
    # device/dtype; otherwise only the copied tensors would follow the source.
    lora = lora.to(device=linear.weight.device, dtype=linear.weight.dtype)
    with torch.no_grad():
        # copy_ duplicates the values instead of sharing storage with the source layer.
        lora.linear.weight.copy_(linear.weight)
        if linear.bias is None:
            # Source has no bias: drop the wrapper's randomly initialised one.
            lora.linear.bias = None
        else:
            lora.linear.bias.copy_(linear.bias)
    return lora
    


We now replace the target linear layers with the `LoRALinear` module defined above.

In [None]:
# Work on a copy so the original `model` keeps its plain linear layers.
lora_model = copy.deepcopy(model)
# Collects the parameters of every injected LoRA matrix.
lora_parameters = []
for block in lora_model.bert.encoder.layer:
    self_attention = block.attention.self
    # NOTE(review): the recorded model printout shows `query` and `key` wrapped in
    # LoRALinear; `value` is included on the assumption that the truncated printout
    # continues the same way - TODO confirm.
    for target_name in ("query", "key", "value"):
        lora_layer = linear_to_lora(getattr(self_attention, target_name))
        setattr(self_attention, target_name, lora_layer)
        lora_parameters.extend(lora_layer.lora_a.parameters())
        lora_parameters.extend(lora_layer.lora_b.parameters())

In [49]:
# Print the module tree after the LoRA layers have been injected.
lora_model_description = f"What is the model modified with LoRA: \n\n {lora_model}"
print(lora_model_description)


What is the model modified with LoRA: 

 BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): LoRALinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora_a): Linear(in_features=768, out_features=8, bias=False)
                (lora_b): Linear(in_features=8, out_features=768, bias=False)
              )
              (key): LoRALinear(
                (linear): Linear(in_features=768, out_features=768, bias=True)
                (lora

### Selecting the parameters that require gradients

In [None]:
# Inside the BERT backbone, only parameters whose name contains "lora" stay trainable;
# every other backbone parameter is frozen. Parameters outside `lora_model.bert`
# are not visited by this loop.
for param_name, param in lora_model.bert.named_parameters():
    print(param_name)
    param.requires_grad = "lora" in param_name

## Using transformers Trainer to fine-tune with LoRA 

In [51]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute the accuracy metric for an evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax of the logits over the last axis.
    """
    model_logits, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(model_logits, axis=-1),
        references=gold_labels,
    )

In [52]:
from transformers import TrainingArguments, Trainer

# Two epochs, evaluating on the validation subset every 128 steps.
training_args = TrainingArguments(output_dir="test_custom_lora", 
                                  eval_strategy="steps",
                                  eval_steps= 128,
                                  num_train_epochs=2,)
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset= training_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics,
)

# Launch fine-tuning of the custom LoRA model. Without this call the trainer is only
# constructed and the metrics table recorded for this section is never produced.
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
128,No log,0.677891,0.566
256,No log,0.614952,0.684
384,No log,0.502475,0.774
512,0.619000,0.418735,0.815
640,0.619000,0.381323,0.836
768,0.619000,0.371402,0.835
896,0.619000,0.354594,0.848
1024,0.388000,0.347619,0.851
1152,0.388000,0.342413,0.855


TrainOutput(global_step=1250, training_loss=0.47044661254882814, metrics={'train_runtime': 171.7994, 'train_samples_per_second': 58.207, 'train_steps_per_second': 7.276, 'total_flos': 2644700098560000.0, 'train_loss': 0.47044661254882814, 'epoch': 2.0})

## Using PEFT library to fine-tune with LoRA 

In [54]:
from peft import LoraConfig, TaskType,  get_peft_model

# Start again from a fresh pretrained classifier for the PEFT experiment.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

# NOTE(review): this uses r=1, whereas the hand-written LoRA above uses rank=8;
# the two runs are therefore not configured identically.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=1,
    lora_alpha=1,
    lora_dropout=0.1,
)

# Wrap the classifier with the LoRA adapters described by `lora_config`.
peft_model = get_peft_model(model, lora_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
from transformers import TrainingArguments, Trainer

# Same schedule as the custom LoRA run (2 epochs, evaluation every 128 steps), but
# with a dedicated output directory: reusing "test_custom_lora" would make this run
# write its checkpoints into the directory of the custom LoRA run.
training_args = TrainingArguments(output_dir="test_peft_lora", 
                                  eval_strategy="steps",
                                  eval_steps= 128,
                                  num_train_epochs=2,)
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset= training_set,
    eval_dataset=validation_set,
    compute_metrics=compute_metrics,
)

In [57]:
# Run training with the most recently constructed `trainer`.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
128,No log,0.686927,0.548
256,No log,0.677564,0.593
384,No log,0.668519,0.616
512,0.689200,0.655116,0.635
640,0.689200,0.653552,0.616
768,0.689200,0.630284,0.65
896,0.689200,0.620816,0.659
1024,0.652700,0.614406,0.658
1152,0.652700,0.610755,0.665


TrainOutput(global_step=1250, training_loss=0.6643443115234375, metrics={'train_runtime': 170.7475, 'train_samples_per_second': 58.566, 'train_steps_per_second': 7.321, 'total_flos': 2632290263040000.0, 'train_loss': 0.6643443115234375, 'epoch': 2.0})