Spaces:

wandb
/

guardrails-genie

Running

App Files Files Community

guardrails-genie / guardrails_genie /guardrails /injection /llama_prompt_guardrail.py

geekyrakshit

update: docs

a70d6a8 7 months ago

raw

history blame

4.62 kB

	from typing import Optional

	import torch
	import torch.nn.functional as F
	import weave
	from transformers import AutoModelForSequenceClassification, AutoTokenizer

	from ..base import Guardrail


	class PromptInjectionLlamaGuardrail(Guardrail):
	"""
	A guardrail class designed to detect and mitigate prompt injection attacks
	using a pre-trained language model. This class leverages a sequence
	classification model to evaluate prompts for potential security threats
	such as jailbreak attempts and indirect injection attempts.

	Attributes:
	model_name (str): The name of the pre-trained model used for sequence
	classification.
	max_sequence_length (int): The maximum length of the input sequence
	for the tokenizer.
	temperature (float): A scaling factor for the model's logits to
	control the randomness of predictions.
	jailbreak_score_threshold (float): The threshold above which a prompt
	is considered a jailbreak attempt.
	indirect_injection_score_threshold (float): The threshold above which
	a prompt is considered an indirect injection attempt.
	"""

	model_name: str = "meta-llama/Prompt-Guard-86M"
	max_sequence_length: int = 512
	temperature: float = 1.0
	jailbreak_score_threshold: float = 0.5
	indirect_injection_score_threshold: float = 0.5
	_tokenizer: Optional[AutoTokenizer] = None
	_model: Optional[AutoModelForSequenceClassification] = None

	def model_post_init(self, __context):
	self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
	self._model = AutoModelForSequenceClassification.from_pretrained(
	self.model_name
	)

	def get_class_probabilities(self, prompt):
	inputs = self._tokenizer(
	prompt,
	return_tensors="pt",
	padding=True,
	truncation=True,
	max_length=self.max_sequence_length,
	)
	with torch.no_grad():
	logits = self._model(**inputs).logits
	scaled_logits = logits / self.temperature
	probabilities = F.softmax(scaled_logits, dim=-1)
	return probabilities

	@weave.op()
	def get_score(self, prompt: str):
	probabilities = self.get_class_probabilities(prompt)
	return {
	"jailbreak_score": probabilities[0, 2].item(),
	"indirect_injection_score": (
	probabilities[0, 1] + probabilities[0, 2]
	).item(),
	}

	"""
	Analyzes a given prompt to determine its safety by evaluating the likelihood
	of it being a jailbreak or indirect injection attempt.

	This function utilizes the `get_score` method to obtain the probabilities
	associated with the prompt being a jailbreak or indirect injection attempt.
	It then compares these probabilities against predefined thresholds to assess
	the prompt's safety. If the `jailbreak_score` exceeds the `jailbreak_score_threshold`,
	the prompt is flagged as a potential jailbreak attempt, and a confidence level
	is calculated and included in the summary. Similarly, if the `indirect_injection_score`
	surpasses the `indirect_injection_score_threshold`, the prompt is flagged as a potential
	indirect injection attempt, with its confidence level also included in the summary.

	Returns a dictionary containing:
	- "safe": A boolean indicating whether the prompt is considered safe
	(i.e., both scores are below their respective thresholds).
	- "summary": A string summarizing the findings, including confidence levels
	for any detected threats.
	"""

	@weave.op()
	def guard(self, prompt: str):
	score = self.get_score(prompt)
	summary = ""
	if score["jailbreak_score"] > self.jailbreak_score_threshold:
	confidence = round(score["jailbreak_score"] * 100, 2)
	summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
	if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
	confidence = round(score["indirect_injection_score"] * 100, 2)
	summary += f" Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
	return {
	"safe": score["jailbreak_score"] < self.jailbreak_score_threshold
	and score["indirect_injection_score"]
	< self.indirect_injection_score_threshold,
	"summary": summary.strip(),
	}

	@weave.op()
	def predict(self, prompt: str):
	return self.guard(prompt)