In [None]:
import pandas as pd

from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForSequenceClassification

import torch

from tqdm import tqdm

from transformers import AdamW

from torch.nn import CrossEntropyLoss



# Load dataset
# The file is tab-separated and is read with header=None, so the three
# columns are named explicitly here.
df = pd.read_csv(
    "/kaggle/input/mit-plagairism-detection-dataset/train_snli.txt",
    sep='\t',
    header=None,
    names=['sentence1', 'sentence2', 'label'],
)

# Preview the first rows.
df.head()

Unnamed: 0,sentence1,sentence2,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0


In [2]:
# (rows, columns) of the full frame.
df.shape

(367373, 3)

In [3]:
# First 100,000 rows of df (1_00_000 == 100000).
# NOTE(review): the dataset constructed later in this notebook is built from
# `df`, not from `df_` — confirm which one is intended for training.
df_ = df[:1_00_000] # Using small set for initial training just get some idea

In [None]:
# Count missing values per column of the subset.
df_.isna().sum() # The recorded output reports 2 missing values in sentence2

sentence1    0
sentence2    2
label        0
dtype: int64

In [5]:
# Storage dtype of the first sentence column.
df['sentence1'].dtype

dtype('O')

In [6]:
# Storage dtype of the second sentence column.
df['sentence2'].dtype

dtype('O')

In [7]:
# Load smolLM model and tokenizer

# Tokenizer for the SmolLM-135M checkpoint.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")

# Same checkpoint loaded for sequence classification with two labels. The
# load warning in the output reports `score.weight` as newly initialized, so
# the model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained("HuggingFaceTB/SmolLM-135M", num_labels=2)

tokenizer_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/831 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/538M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM-135M and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class PlagiarismDataset(Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'`` (the
    tokenizer output with its leading batch dimension squeezed away) and a
    ``'label'`` tensor of dtype ``torch.long``.

    Args:
        df_: DataFrame with ``sentence1``, ``sentence2`` and ``label`` columns.
        tokenizer: Callable invoked as ``tokenizer(text_a, text_b, ...)`` that
            returns a mapping containing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, df_, tokenizer, max_length=128):
        self.df = df_
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``str(value)``, or an empty string when the cell is missing."""
        return "" if pd.isna(value) else str(value)

    def __getitem__(self, index):
        """Tokenize the sentence pair at positional ``index`` and attach its label."""
        record = self.df.iloc[index]

        encoded = self.tokenizer(
            self._as_text(record['sentence1']),
            self._as_text(record['sentence2']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it so
        # items can be stacked into a batch later.
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(record['label'], dtype=torch.long),
        }





def collate_fn(batch):
    """Combine a list of dataset items into one batch dict.

    Args:
        batch: Sequence of dicts, each holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        dict: The same three keys, each mapped to the items' tensors stacked
        along a new leading dimension.
    """
    def _stack(field):
        # One tensor per item, joined on a new first axis.
        return torch.stack([example[field] for example in batch])

    return {
        'input_ids': _stack('input_ids'),
        'attention_mask': _stack('attention_mask'),
        'label': _stack('label'),
    }



# Add a new padding token
# Registers '[PAD]' as the tokenizer's pad token; PlagiarismDataset pads every
# pair with padding='max_length'.

tokenizer.add_special_tokens({'pad_token': '[PAD]'})



# Resize the model's token embeddings if using a model
# The embedding table is resized to len(tokenizer), i.e. the vocabulary size
# after the pad token was added.

model.resize_token_embeddings(len(tokenizer))



# Create the dataset and dataloader
# NOTE(review): the full `df` is passed here, not the 100,000-row `df_` subset
# created earlier — confirm that training on the full frame is intended.

dataset = PlagiarismDataset(df, tokenizer)

# Batches of 16 items, reshuffled by the loader, assembled by collate_fn.
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)



# Check the padding token details

print("Padding Token:", tokenizer.pad_token)

print("Padding Token ID:", tokenizer.pad_token_id)

Padding Token: [PAD]
Padding Token ID: 49152


In [9]:
# Set the pad token id in the model's config
# Copies the tokenizer's pad id onto the model config so both agree.
# NOTE(review): presumably the classification head relies on pad_token_id to
# locate the last non-padding token of each sequence — confirm.

model.config.pad_token_id = tokenizer.pad_token_id

In [10]:
# Device setup: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

# Wrap the model in DataParallel exactly once. Without this guard, re-running
# the cell nests a DataParallel inside another one; `model.module` would then
# be a DataParallel itself, and unwrapping a single `.module` level would no
# longer yield the underlying Hugging Face model.
if not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)



def train_model(model, data_loader, optimizer, loss_fn, epochs=3):
    """Train ``model`` on ``data_loader`` and report the mean loss of each epoch.

    Batches are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        data_loader: Iterable of batch dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors. It does not need to
            define ``__len__``; batches are counted while iterating.
        optimizer: Optimizer that updates ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar tensor.
        epochs: Number of passes over ``data_loader``.

    Returns:
        list[float]: Mean batch loss per epoch, in order. An epoch that
        yields no batches is recorded as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    model.train()
    epoch_losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0

        for batch in tqdm(data_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Clear gradients left over from the previous step before
            # computing new ones.
            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Divide by the number of batches actually seen rather than
        # len(data_loader), which is undefined for loaders without a length
        # and zero for empty ones.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)

        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss}")

    print("Training complete!")
    return epoch_losses



# Evaluation function

def evaluate_model(model, data_loader):
    """Compute, print and return classification accuracy over ``data_loader``.

    The model is switched to eval mode and left in that mode. Batches are
    moved to the notebook-level ``device`` and gradients are disabled during
    the pass.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        data_loader: Iterable of batch dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        float: Fraction of samples whose arg-max prediction equals the label,
        or ``nan`` when the loader yields no samples (instead of raising
        ``ZeroDivisionError``).
    """
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            # Predicted class = index of the largest logit in each row.
            predictions = torch.argmax(outputs.logits, dim=1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total if total else float("nan")
    print(f"Accuracy: {accuracy:.4f}")
    return accuracy

In [11]:
# Set up training

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

model.to(device)

# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are pinned to the
# defaults transformers.AdamW used (1e-6 and 0.0) so the optimizer
# hyperparameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Cross-entropy over the model's logits against the integer class labels.
loss_fn = CrossEntropyLoss()

Using device: cuda




In [12]:
# Train the model
# Three passes over data_loader using the optimizer and loss function set up
# in the previous cell.

train_model(model, data_loader, optimizer, loss_fn, epochs=3)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Training Epoch 1: 100%|██████████| 22961/22961 [2:25:56<00:00,  2.62it/s]


Epoch 1/3, Loss: 0.1226412019093822


Training Epoch 2: 100%|██████████| 22961/22961 [2:25:56<00:00,  2.62it/s]


Epoch 2/3, Loss: 0.05606692839428341


Training Epoch 3: 100%|██████████| 22961/22961 [2:25:31<00:00,  2.63it/s]

Epoch 3/3, Loss: 0.02446798090389194
Training complete!





In [13]:
# Evaluate the model
# NOTE: data_loader is the same loader that was passed to train_model, so the
# accuracy printed here is measured on the training data, not on held-out data.

evaluate_model(model, data_loader)

Accuracy: 0.9966


In [None]:
save_directory = "/kaggle/working/"

# When the model is wrapped (e.g. by DataParallel) the underlying network is
# exposed as `.module`; otherwise the model object itself is saved.
model_to_save = getattr(model, "module", model)

# Persist the fine-tuned model first, then the tokenizer, to the same directory.
model_to_save.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to /kaggle/working/
