geekyrakshit committed on
Commit
a6ca408
·
1 Parent(s): 6779bd2

update: PromptInjectionLlamaGuardrail

Browse files
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py CHANGED
@@ -14,7 +14,6 @@ class PromptInjectionLlamaGuardrail(Guardrail):
14
  temperature: float = 1.0
15
  jailbreak_score_threshold: float = 0.5
16
  indirect_injection_score_threshold: float = 0.5
17
- device: str = "cuda" if torch.cuda.is_available() else "cpu"
18
  _tokenizer: Optional[AutoTokenizer] = None
19
  _model: Optional[AutoModelForSequenceClassification] = None
20
 
@@ -32,7 +31,6 @@ class PromptInjectionLlamaGuardrail(Guardrail):
32
  truncation=True,
33
  max_length=self.max_sequence_length,
34
  )
35
- inputs = inputs.to(self.device)
36
  with torch.no_grad():
37
  logits = self._model(**inputs).logits
38
  scaled_logits = logits / self.temperature
@@ -58,12 +56,12 @@ class PromptInjectionLlamaGuardrail(Guardrail):
58
  summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
59
  if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
60
  confidence = round(score["indirect_injection_score"] * 100, 2)
61
- summary += f"Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
62
  return {
63
  "safe": score["jailbreak_score"] < self.jailbreak_score_threshold
64
  and score["indirect_injection_score"]
65
  < self.indirect_injection_score_threshold,
66
- "summary": summary,
67
  }
68
 
69
  @weave.op()
 
14
  temperature: float = 1.0
15
  jailbreak_score_threshold: float = 0.5
16
  indirect_injection_score_threshold: float = 0.5
 
17
  _tokenizer: Optional[AutoTokenizer] = None
18
  _model: Optional[AutoModelForSequenceClassification] = None
19
 
 
31
  truncation=True,
32
  max_length=self.max_sequence_length,
33
  )
 
34
  with torch.no_grad():
35
  logits = self._model(**inputs).logits
36
  scaled_logits = logits / self.temperature
 
56
  summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
57
  if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
58
  confidence = round(score["indirect_injection_score"] * 100, 2)
59
+ summary += f" Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
60
  return {
61
  "safe": score["jailbreak_score"] < self.jailbreak_score_threshold
62
  and score["indirect_injection_score"]
63
  < self.indirect_injection_score_threshold,
64
+ "summary": summary.strip(),
65
  }
66
 
67
  @weave.op()