Spaces:

wandb
/

guardrails-genie

Running

File size: 4,617 Bytes

from typing import Optional

import torch
import torch.nn.functional as F
import weave
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from ..base import Guardrail


class PromptInjectionLlamaGuardrail(Guardrail):
    """
    A guardrail class designed to detect and mitigate prompt injection attacks
    using a pre-trained language model. This class leverages a sequence
    classification model to evaluate prompts for potential security threats
    such as jailbreak attempts and indirect injection attempts.

    Attributes:
        model_name (str): The name of the pre-trained model used for sequence
            classification.
        max_sequence_length (int): The maximum length of the input sequence
            for the tokenizer.
        temperature (float): A scaling factor for the model's logits to
            control the randomness of predictions.
        jailbreak_score_threshold (float): The threshold above which a prompt
            is considered a jailbreak attempt.
        indirect_injection_score_threshold (float): The threshold above which
            a prompt is considered an indirect injection attempt.
    """

    model_name: str = "meta-llama/Prompt-Guard-86M"
    max_sequence_length: int = 512
    temperature: float = 1.0
    jailbreak_score_threshold: float = 0.5
    indirect_injection_score_threshold: float = 0.5
    _tokenizer: Optional[AutoTokenizer] = None
    _model: Optional[AutoModelForSequenceClassification] = None

    def model_post_init(self, __context):
        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self._model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name
        )

    def get_class_probabilities(self, prompt):
        inputs = self._tokenizer(
            prompt,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_sequence_length,
        )
        with torch.no_grad():
            logits = self._model(**inputs).logits
        scaled_logits = logits / self.temperature
        probabilities = F.softmax(scaled_logits, dim=-1)
        return probabilities

    @weave.op()
    def get_score(self, prompt: str):
        probabilities = self.get_class_probabilities(prompt)
        return {
            "jailbreak_score": probabilities[0, 2].item(),
            "indirect_injection_score": (
                probabilities[0, 1] + probabilities[0, 2]
            ).item(),
        }

    """
    Analyzes a given prompt to determine its safety by evaluating the likelihood
    of it being a jailbreak or indirect injection attempt.

    This function utilizes the `get_score` method to obtain the probabilities
    associated with the prompt being a jailbreak or indirect injection attempt.
    It then compares these probabilities against predefined thresholds to assess
    the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
    the prompt is flagged as a potential jailbreak attempt, and a confidence level
    is calculated and included in the summary. Similarly, if the `indirect_injection_score`
    surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
    indirect injection attempt, with its confidence level also included in the summary.

    Returns a dictionary containing:
        - "safe": A boolean indicating whether the prompt is considered safe
          (i.e., both scores are below their respective thresholds).
        - "summary": A string summarizing the findings, including confidence levels
          for any detected threats.
    """

    @weave.op()
    def guard(self, prompt: str):
        score = self.get_score(prompt)
        summary = ""
        if score["jailbreak_score"] > self.jailbreak_score_threshold:
            confidence = round(score["jailbreak_score"] * 100, 2)
            summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
        if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
            confidence = round(score["indirect_injection_score"] * 100, 2)
            summary += f" Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
        return {
            "safe": score["jailbreak_score"] < self.jailbreak_score_threshold
            and score["indirect_injection_score"]
            < self.indirect_injection_score_threshold,
            "summary": summary.strip(),
        }

    @weave.op()
    def predict(self, prompt: str):
        return self.guard(prompt)