import argparse

import numpy as np
import torch
from datasets import load_from_disk
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def compute_metrics(predictions, labels):
    # Fraction of predictions that exactly match the reference labels.
    accuracy = (np.array(predictions) == np.array(labels)).mean()
    return {"accuracy": accuracy}


def preprocess_function(examples, tokenizer):
    # Tokenize a single example's text, truncating to the model's max input length.
    return tokenizer(examples["text"], truncation=True, return_tensors="pt")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, required=True, help="Path or Hub ID of the fine-tuned classifier")
args = parser.parse_args()
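
# Example invocation (script name and checkpoint path are placeholders):
#   python evaluate_classifier.py --model ./checkpoints/jailbreak-classifier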

model = AutoModelForSequenceClassification.from_pretrained(
    args.model,
    num_labels=2,
    id2label={0: "safe", 1: "jailbreak"},
    label2id={"safe": 0, "jailbreak": 1},
).to(device)
model.eval()  # make sure dropout is disabled for inference
tokenizer = AutoTokenizer.from_pretrained(args.model)
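
# Optional sanity check (an addition, not in the original script): the label
# mapping passed above should round-trip through the loaded config.
assert model.config.id2label[1] == "jailbreak"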

# Load the held-out test set saved earlier with Dataset.save_to_disk.
test_data = load_from_disk("test_data")

references = [example["label"] for example in test_data]
predictions = []

# Classify one example at a time and record the argmax class id.
for example in tqdm(test_data, total=len(test_data)):
    inputs = preprocess_function(example, tokenizer)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()
    predictions.append(prediction)

metrics = compute_metrics(predictions, references)
print("Accuracy:", metrics["accuracy"])