Spaces:
Running
Running
Update infer.py
Browse files
infer.py
CHANGED
@@ -75,16 +75,23 @@ class PromptGuardAnomalyDetector(AbstractAnomalyDetector):
|
|
75 |
threshold = threshold or self._threshold
|
76 |
anomalies = self.classifier(embeddings)
|
77 |
print(anomalies)
|
|
|
78 |
# [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
if anomalies:
|
80 |
known_attack_vectors = [
|
81 |
KnownAttackVector(
|
82 |
-
known_prompt=anomaly
|
83 |
similarity_percentage=anomaly["score"],
|
84 |
source="meta-llama/Llama-Prompt-Guard-2-86M",
|
85 |
)
|
86 |
for anomaly in anomalies
|
87 |
-
if anomaly["score"] >= threshold
|
88 |
]
|
89 |
return AnomalyResult(anomaly=True, reason=known_attack_vectors)
|
90 |
return AnomalyResult(anomaly=False)
|
|
|
75 |
threshold = threshold or self._threshold
|
76 |
anomalies = self.classifier(embeddings)
|
77 |
print(anomalies)
|
78 |
+
# promptguard 1
|
79 |
# [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
|
80 |
+
|
81 |
+
# promptguard 2
|
82 |
+
# [{'label': 'LABEL_0', 'score': 0.9999452829360962}]
|
83 |
+
# [{'label': 'LABEL_1', 'score': 0.9999452829360962}]
|
84 |
+
# "LABEL_0" (Negative classification, benign)
|
85 |
+
# "LABEL_1" (Positive classification, malicious)
|
86 |
if anomalies:
|
87 |
known_attack_vectors = [
|
88 |
KnownAttackVector(
|
89 |
+
known_prompt="PromptGuard detected anomaly",
|
90 |
similarity_percentage=anomaly["score"],
|
91 |
source="meta-llama/Llama-Prompt-Guard-2-86M",
|
92 |
)
|
93 |
for anomaly in anomalies
|
94 |
+
if anomaly["score"] >= threshold and anomaly["label"] == "LABEL_1" # LABEL_0 is negative == benign
|
95 |
]
|
96 |
return AnomalyResult(anomaly=True, reason=known_attack_vectors)
|
97 |
return AnomalyResult(anomaly=False)
|