avilum commited on
Commit
294fe68
·
verified ·
1 Parent(s): 0b54c92

Update infer.py

Browse files
Files changed (1) hide show
  1. infer.py +9 -2
infer.py CHANGED
@@ -75,16 +75,23 @@ class PromptGuardAnomalyDetector(AbstractAnomalyDetector):
75
  threshold = threshold or self._threshold
76
  anomalies = self.classifier(embeddings)
77
  print(anomalies)
 
78
  # [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
 
 
 
 
 
 
79
  if anomalies:
80
  known_attack_vectors = [
81
  KnownAttackVector(
82
- known_prompt=anomaly["label"],
83
  similarity_percentage=anomaly["score"],
84
  source="meta-llama/Llama-Prompt-Guard-2-86M",
85
  )
86
  for anomaly in anomalies
87
- if anomaly["score"] >= threshold
88
  ]
89
  return AnomalyResult(anomaly=True, reason=known_attack_vectors)
90
  return AnomalyResult(anomaly=False)
 
75
  threshold = threshold or self._threshold
76
  anomalies = self.classifier(embeddings)
77
  print(anomalies)
78
+ # promptguard 1
79
  # [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
80
+
81
+ # promptguard 2
82
+ # [{'label': 'LABEL_0', 'score': 0.9999452829360962}]
83
+ # [{'label': 'LABEL_1', 'score': 0.9999452829360962}]
84
+ # "LABEL_0" (Negative classification, benign)
85
+ # "LABEL_1" (Positive classification, malicious)
86
  if anomalies:
87
  known_attack_vectors = [
88
  KnownAttackVector(
89
+ known_prompt="PromptGuard detected anomaly",
90
  similarity_percentage=anomaly["score"],
91
  source="meta-llama/Llama-Prompt-Guard-2-86M",
92
  )
93
  for anomaly in anomalies
94
+ if anomaly["score"] >= threshold and anomaly["label"] == "LABEL_1" # LABEL_0 is negative == benign
95
  ]
96
  return AnomalyResult(anomaly=True, reason=known_attack_vectors)
97
  return AnomalyResult(anomaly=False)