In [1]:
import os

from transformers import GPT2Tokenizer, LlamaForSequenceClassification

# Load the GPT2 tokenizer and the Llama sequence-classification model from a
# local fine-tuned checkpoint directory.
# The directory can be overridden with the PLAGIARISM_MODEL_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original author's path is used unchanged.
model_path = os.environ.get(
    "PLAGIARISM_MODEL_DIR",
    r"C:\Users\jatin\OneDrive\Desktop\plagiarism-detection\smolLM-fined-tuned-for-PLAGAIRISM-Detection\model",
)

# local_files_only=True restricts loading to files already present on disk.
tokenizer = GPT2Tokenizer.from_pretrained(model_path, local_files_only=True)
model = LlamaForSequenceClassification.from_pretrained(model_path, local_files_only=True)

# Switch to evaluation mode. Keeping this call as the cell's last expression
# also displays the module tree.
model.eval()

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(49153, 576, padding_idx=49152)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
     

In [2]:
import torch
import pandas as pd

# Read the tab-separated sentence pairs. The file is parsed without a header
# row (header=None), so the three column names are supplied explicitly.
df = pd.read_csv(
    "train_snli.txt",
    delimiter='\t',
    header=None,
    names=['sentence1', 'sentence2', 'label'],
)

# Preview the first rows.
df.head()

Unnamed: 0,sentence1,sentence2,label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0


In [3]:
from torch.utils.data import Dataset, DataLoader
# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class PlagiarismDataset(Dataset):
    """Sentence-pair dataset backed by a pandas DataFrame.

    Each item is the tokenized (``sentence1``, ``sentence2``) pair of one row,
    padded/truncated to ``max_length``, together with that row's ``label``.
    """

    def __init__(self, df, tokenizer, max_length=128):
        """Store the source frame, the tokenizer callable and the sequence length."""
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.df = df

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string; missing values become the empty string."""
        return "" if pd.isna(value) else str(value)

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for row ``index``."""
        record = self.df.iloc[index]

        # Missing sentences are replaced by "" rather than dropped.
        first = self._as_text(record['sentence1'])
        second = self._as_text(record['sentence2'])

        encoded = self.tokenizer(
            first,
            second,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # Drop the leading axis of the tokenizer output so that each item is a
        # single sequence; batching is left to the DataLoader.
        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(record['label'], dtype=torch.long)
        return item

def collate_fn(batch):
    """Combine a list of dataset items into one batch.

    Args:
        batch: Sequence of dicts, each holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        Dict with the same three keys, where each value is the per-item
        tensors stacked along a new leading dimension.
    """
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in ('input_ids', 'attention_mask', 'label')
    }

In [4]:
# Show which torch device was selected for this session.
device

device(type='cuda')

In [5]:
# Use the tail of the loaded file as the evaluation split. `.iloc` makes it
# explicit that the cut is positional (everything from row 366,900 onwards).
# NOTE(review): confirm these rows were excluded from fine-tuning; if the
# model saw them during training, the reported metrics are optimistic.
TEST_START_ROW = 366_900
df_test = df.iloc[TEST_START_ROW:]

# Register a padding token only when the tokenizer does not already have one.
# Adding '[PAD]' unconditionally would replace an existing pad token with a
# new, untrained one.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Keep the embedding matrix in step with the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head uses config.pad_token_id to find the last
# non-padding token of each row; batches larger than 1 need it to be set.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Create the DataLoader for the test set. Order is preserved (shuffle=False).
test_dataset = PlagiarismDataset(df_test, tokenizer)
test_data_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

In [6]:
from sklearn.metrics import classification_report
# Select the compute device (GPU when PyTorch reports one, otherwise CPU) and
# place the model on it before any batch is evaluated.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

def evaluate_model(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and print a classification report.

    Args:
        model: Classifier called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the input tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        None. The report is printed to standard output.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # Set model to evaluation mode
    preds_list = []
    labels_list = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for batch in data_loader:
            # Move input tensors to the same device as the model
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Predicted class = index of the largest logit in each row
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            preds_list.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are not sent to
            # the device first.
            labels_list.extend(batch['label'].tolist())

    # Compute evaluation metrics (classification_report is imported at the top
    # of this cell).
    report = classification_report(labels_list, preds_list)
    print("Classification Report:\n", report)

# Score the test split; evaluate_model prints the classification report.
evaluate_model(model=model, data_loader=test_data_loader)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       236
           1       1.00      1.00      1.00       237

    accuracy                           1.00       473
   macro avg       1.00      1.00      1.00       473
weighted avg       1.00      1.00      1.00       473

