File size: 1,580 Bytes
729b0f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_from_disk

def compute_metrics(predictions, labels):
    """Compute classification accuracy.

    Args:
        predictions: Sequence (or array-like) of predicted class ids.
        labels: Sequence (or array-like) of reference class ids; must have
            the same shape as ``predictions``.

    Returns:
        A dict ``{"accuracy": value}`` where ``value`` is a plain ``float``:
        the fraction of positions at which prediction and label are equal,
        or NaN when both inputs are empty.

    Raises:
        ValueError: If ``predictions`` and ``labels`` differ in shape.
    """
    predicted = np.asarray(predictions)
    expected = np.asarray(labels)

    # Fail loudly on a count mismatch instead of relying on NumPy's
    # broadcasting behaviour for ``==``, which varies between versions.
    if predicted.shape != expected.shape:
        raise ValueError(
            f"predictions and labels must have the same shape, "
            f"got {predicted.shape} and {expected.shape}"
        )

    # The mean of an empty array is NaN; return it directly so no
    # "mean of empty slice" RuntimeWarning is emitted.
    if predicted.size == 0:
        return {"accuracy": float("nan")}

    return {"accuracy": float(np.mean(predicted == expected))}

def preprocess_function(examples, tokenizer):
    """Tokenize the ``"text"`` field of ``examples``.

    Args:
        examples: Mapping with a ``"text"`` entry to be tokenized.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, return_tensors="pt")``.

    Returns:
        Whatever ``tokenizer`` returns for that call, unmodified.
    """
    text = examples["text"]
    encoded = tokenizer(text, truncation=True, return_tensors="pt")
    return encoded
# ---------------------------------------------------------------------------
# Evaluation script: load a sequence-classification checkpoint, run it over
# the dataset stored in "test_data" one example at a time, and print accuracy.
# ---------------------------------------------------------------------------
from tqdm import tqdm

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Command line: --model is required and is passed to both from_pretrained calls.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True)
args = parser.parse_args()

# Two-class model (0 = "safe", 1 = "jailbreak") and its tokenizer.
id2label = {0: "safe", 1: "jailbreak"}
label2id = {"safe": 0, "jailbreak": 1}
model = AutoModelForSequenceClassification.from_pretrained(
    args.model, num_labels=2, id2label=id2label, label2id=label2id
)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(args.model)

# The test split is read from the "test_data" directory relative to the CWD.
test_data = load_from_disk("test_data")

# Reference labels come straight from each example's "label" field.
references = [row["label"] for row in test_data]

# Run inference example by example with gradient tracking disabled.
predictions = []
with torch.no_grad():
    for example in tqdm(test_data, total=len(test_data)):
        encoded = preprocess_function(example, tokenizer)
        # Put every tokenizer output on the same device as the model.
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**inputs).logits
        # argmax over dim 1 picks the highest-scoring class; .item() works
        # because each forward pass handles a single example.
        predictions.append(logits.argmax(dim=1).item())

# Score the predictions against the references and report the result.
metrics = compute_metrics(predictions, references)
print("Accuracy: ", metrics["accuracy"])