In [2]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# Load the IMDb dataset
dataset = load_dataset("imdb")

# Tokenizer function
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Format for PyTorch
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000)) # Subset for training
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) # Subset for testing

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [4]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [5]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='steps',  
    save_strategy='steps',        
    load_best_model_at_end=True,
    logging_steps=50,             
    save_steps=50                 
)


In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1875
  Number of trainable parameters = 66955010


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6888,0.680938,0.661,0.543742,0.792157,0.413934
100,0.629,0.465259,0.841,0.819113,0.920716,0.737705
150,0.3712,0.323407,0.868,0.86747,0.850394,0.885246
200,0.3363,0.37415,0.857,0.836197,0.948052,0.747951
250,0.3367,0.312763,0.865,0.871795,0.812389,0.940574
300,0.3118,0.296506,0.889,0.88254,0.912473,0.854508
350,0.3098,0.286319,0.886,0.886228,0.863813,0.909836
400,0.2723,0.292773,0.89,0.884696,0.905579,0.864754
450,0.3151,0.419856,0.854,0.831019,0.954787,0.735656
500,0.3509,0.298303,0.862,0.869565,0.807018,0.942623


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-50
Configuration saved in ./results\checkpoint-50\config.json
Model weights saved in ./results\checkpoint-50\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-100
Configuration saved in ./results\checkpoint-100\config.json
Model weights saved in ./result

Model weights saved in ./results\checkpoint-800\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-850
Configuration saved in ./results\checkpoint-850\config.json
Model weights saved in ./results\checkpoint-850\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-900
Configuration saved in

  Batch size = 64
Saving model checkpoint to ./results\checkpoint-1600
Configuration saved in ./results\checkpoint-1600\config.json
Model weights saved in ./results\checkpoint-1600\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./results\checkpoint-1650
Configuration saved in ./results\checkpoint-1650\config.json
Model weights saved in ./results\checkpoint-1650\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Run

TrainOutput(global_step=1875, training_loss=0.22492422332763673, metrics={'train_runtime': 50814.837, 'train_samples_per_second': 0.59, 'train_steps_per_second': 0.037, 'total_flos': 3974021959680000.0, 'train_loss': 0.22492422332763673, 'epoch': 3.0})

In [8]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


{'eval_loss': 0.23037973046302795,
 'eval_accuracy': 0.91,
 'eval_f1': 0.9085365853658537,
 'eval_precision': 0.9012096774193549,
 'eval_recall': 0.9159836065573771,
 'eval_runtime': 450.0402,
 'eval_samples_per_second': 2.222,
 'eval_steps_per_second': 0.036,
 'epoch': 3.0}

In [9]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

Configuration saved in ./saved_model\config.json
Model weights saved in ./saved_model\pytorch_model.bin
tokenizer config file saved in ./saved_model\tokenizer_config.json
Special tokens file saved in ./saved_model\special_tokens_map.json


('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json',
 './saved_model\\tokenizer.json')

In [10]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    prediction = logits.argmax(-1).item()
    return 'positive' if prediction == 1 else 'negative'

# Test with a new sentence
print(predict_sentiment("This movie was great! I loved it."))


positive


In [3]:
model.save_pretrained('./Sentimental_Analysis')
tokenizer.save_pretrained('./Sentimental_Analysis')


NameError: name 'model' is not defined