geekyrakshit's picture
update: docs
a70d6a8
raw
history blame
4.62 kB
from typing import Optional
import torch
import torch.nn.functional as F
import weave
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from ..base import Guardrail
class PromptInjectionLlamaGuardrail(Guardrail):
"""
A guardrail class designed to detect and mitigate prompt injection attacks
using a pre-trained language model. This class leverages a sequence
classification model to evaluate prompts for potential security threats
such as jailbreak attempts and indirect injection attempts.
Attributes:
model_name (str): The name of the pre-trained model used for sequence
classification.
max_sequence_length (int): The maximum length of the input sequence
for the tokenizer.
temperature (float): A scaling factor for the model's logits to
control the randomness of predictions.
jailbreak_score_threshold (float): The threshold above which a prompt
is considered a jailbreak attempt.
indirect_injection_score_threshold (float): The threshold above which
a prompt is considered an indirect injection attempt.
"""
model_name: str = "meta-llama/Prompt-Guard-86M"
max_sequence_length: int = 512
temperature: float = 1.0
jailbreak_score_threshold: float = 0.5
indirect_injection_score_threshold: float = 0.5
_tokenizer: Optional[AutoTokenizer] = None
_model: Optional[AutoModelForSequenceClassification] = None
def model_post_init(self, __context):
self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self._model = AutoModelForSequenceClassification.from_pretrained(
self.model_name
)
def get_class_probabilities(self, prompt):
inputs = self._tokenizer(
prompt,
return_tensors="pt",
padding=True,
truncation=True,
max_length=self.max_sequence_length,
)
with torch.no_grad():
logits = self._model(**inputs).logits
scaled_logits = logits / self.temperature
probabilities = F.softmax(scaled_logits, dim=-1)
return probabilities
@weave.op()
def get_score(self, prompt: str):
probabilities = self.get_class_probabilities(prompt)
return {
"jailbreak_score": probabilities[0, 2].item(),
"indirect_injection_score": (
probabilities[0, 1] + probabilities[0, 2]
).item(),
}
"""
Analyzes a given prompt to determine its safety by evaluating the likelihood
of it being a jailbreak or indirect injection attempt.
This function utilizes the `get_score` method to obtain the probabilities
associated with the prompt being a jailbreak or indirect injection attempt.
It then compares these probabilities against predefined thresholds to assess
the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
the prompt is flagged as a potential jailbreak attempt, and a confidence level
is calculated and included in the summary. Similarly, if the `indirect_injection_score`
surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
indirect injection attempt, with its confidence level also included in the summary.
Returns a dictionary containing:
- "safe": A boolean indicating whether the prompt is considered safe
(i.e., both scores are below their respective thresholds).
- "summary": A string summarizing the findings, including confidence levels
for any detected threats.
"""
@weave.op()
def guard(self, prompt: str):
score = self.get_score(prompt)
summary = ""
if score["jailbreak_score"] > self.jailbreak_score_threshold:
confidence = round(score["jailbreak_score"] * 100, 2)
summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
confidence = round(score["indirect_injection_score"] * 100, 2)
summary += f" Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
return {
"safe": score["jailbreak_score"] < self.jailbreak_score_threshold
and score["indirect_injection_score"]
< self.indirect_injection_score_threshold,
"summary": summary.strip(),
}
@weave.op()
def predict(self, prompt: str):
return self.guard(prompt)