Spaces:

avilum
/

anomaly-detection

Running

avilum committed 9 days ago

Commit

294fe68

verified ·

1 Parent(s): 0b54c92

Update infer.py

Files changed (1) hide show

infer.py CHANGED Viewed

@@ -75,16 +75,23 @@ class PromptGuardAnomalyDetector(AbstractAnomalyDetector):
         threshold = threshold or self._threshold
         anomalies = self.classifier(embeddings)
         print(anomalies)
         # [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
         if anomalies:
             known_attack_vectors = [
                 KnownAttackVector(
-                    known_prompt=anomaly["label"],
                     similarity_percentage=anomaly["score"],
                     source="meta-llama/Llama-Prompt-Guard-2-86M",
                 )
                 for anomaly in anomalies
-                if anomaly["score"] >= threshold
             ]
             return AnomalyResult(anomaly=True, reason=known_attack_vectors)
         return AnomalyResult(anomaly=False)

         threshold = threshold or self._threshold
         anomalies = self.classifier(embeddings)
         print(anomalies)
+        # promptguard 1
         # [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
+        # promptguard 2
+        # [{'label': 'LABEL_0', 'score': 0.9999452829360962}]
+        # [{'label': 'LABEL_1', 'score': 0.9999452829360962}]
+            # "LABEL_0" (Negative classification, benign)
+            # "LABEL_1" (Positive classification, malicious)
         if anomalies:
             known_attack_vectors = [
                 KnownAttackVector(
+                    known_prompt="PromptGuard detected anomaly",
                     similarity_percentage=anomaly["score"],
                     source="meta-llama/Llama-Prompt-Guard-2-86M",
                 )
                 for anomaly in anomalies
+                if anomaly["score"] >= threshold and anomaly["label"] == "LABEL_1" # LABEL_0 is negative == benign
             ]
             return AnomalyResult(anomaly=True, reason=known_attack_vectors)
         return AnomalyResult(anomaly=False)