Spaces:

avilum
/

anomaly-detection

Running

App Files Files Community

avilum commited on 16 days ago

Commit

621249e

verified ·

1 Parent(s): 54027b8

Create infer.py

Browse files

Files changed (1) hide show

infer.py +178 -0

infer.py ADDED Viewed

	@@ -0,0 +1,178 @@

+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass
+import json
+import os
+from typing import Any
+import sys
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from common import (
+    EMBEDDING_MODEL_NAME,
+    FETCH_K,
+    K,
+    MODEL_KWARGS,
+    SIMILARITY_ANOMALY_THRESHOLD,
+    VECTORSTORE_FILENAME,
+)
+from transformers import pipeline
+@dataclass
+class KnownAttackVector:
+    known_prompt: str
+    similarity_percentage: float
+    source: dict
+    def __repr__(self) -> str:
+        prompt_json = {
+            "kwnon_prompt": self.known_prompt,
+            "source": self.source,
+            "similarity ": f"{100 * float(self.similarity_percentage):.2f} %",
+        }
+        return f"""<KnownAttackVector {json.dumps(prompt_json, indent=4)}>"""
+@dataclass
+class AnomalyResult:
+    anomaly: bool
+    reason: list[KnownAttackVector] = None
+    def __repr__(self) -> str:
+        if self.anomaly:
+            reasons = "\n\t".join(
+                [json.dumps(asdict(_), indent=4) for _ in self.reason]
+            )
+            return """<Anomaly\nReasons: {reasons}>""".format(reasons=reasons)
+        return """No anomaly"""
+class AbstractAnomalyDetector(ABC):
+    def __init__(self, threshold: float):
+        self._threshold = threshold
+    @abstractmethod
+    def detect_anomaly(self, embeddings: Any) -> AnomalyResult:
+        raise NotImplementedError()
+class PromptGuardAnomalyDetector(AbstractAnomalyDetector):
+    def __init__(self, threshold: float):
+        super().__init__(threshold)
+        print('Loading prompt guard model...')
+        self.classifier = pipeline(
+            "text-classification", model="../data/models/Prompt-Guard-86M"
+        )
+    def detect_anomaly(
+        self,
+        embeddings: str,
+        k: int = K,
+        fetch_k: int = FETCH_K,
+        threshold: float = None,
+    ) -> AnomalyResult:
+        threshold = threshold or self._threshold
+        anomalies = self.classifier(embeddings)
+        print(anomalies)
+        # [{'label': 'JAILBREAK', 'score': 0.9999452829360962}]
+        if anomalies:
+            known_attack_vectors = [
+                KnownAttackVector(
+                    known_prompt=anomaly["label"],
+                    similarity_percentage=anomaly["score"],
+                    source="Prompt-Guard-86M",
+                )
+                for anomaly in anomalies
+                if anomaly["score"] >= threshold
+            ]
+            return AnomalyResult(anomaly=True, reason=known_attack_vectors)
+        return AnomalyResult(anomaly=False)
+class EmbeddingsAnomalyDetector(AbstractAnomalyDetector):
+    def __init__(self, vector_store: FAISS, threshold: float):
+        self._vector_store = vector_store
+        super().__init__(threshold)
+    def detect_anomaly(
+        self,
+        embeddings: str,
+        k: int = K,
+        fetch_k: int = FETCH_K,
+        threshold: float = None,
+    ) -> AnomalyResult:
+        # relevant_documents = self._vector_store.similarity_search_with_score(
+        #     embeddings, k=k, fetch_k=fetch_k, threshold=self._threshold,
+        # )
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=160,  # TODO: Should match the ingested chunk size.
+            chunk_overlap=40,
+            length_function=len,
+        )
+        split_input = text_splitter.split_text(embeddings)
+        threshold = threshold or self._threshold
+        for part in split_input:
+            relevant_documents = (
+                self._vector_store.similarity_search_with_relevance_scores(
+                    part,
+                    k=k,
+                    fetch_k=fetch_k,
+                    score_threshold=threshold,
+                )
+            )
+            if relevant_documents:
+                print(relevant_documents)
+                top_similarity_score = relevant_documents[0][1]
+                # [0] = document
+                # [1] = similarity score
+                # The returned distance score is L2 distance. Therefore, a lower score is better.
+                # if self._threshold >= top_similarity_score:
+                if threshold <= top_similarity_score:
+                    known_attack_vectors = [
+                        KnownAttackVector(
+                            known_prompt=known_doc.page_content,
+                            source=known_doc.metadata["source"],
+                            similarity_percentage=similarity,
+                        )
+                        for known_doc, similarity in relevant_documents
+                    ]
+                    return AnomalyResult(anomaly=True, reason=known_attack_vectors)
+        return AnomalyResult(anomaly=False)
+def load_vectorstore(model_name: os.PathLike, model_kwargs: dict):
+    embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
+    try:
+        vector_store = FAISS.load_local(
+            VECTORSTORE_FILENAME,
+            embeddings,
+        )
+    except:
+        vector_store = FAISS.load_local(
+            VECTORSTORE_FILENAME, embeddings, allow_dangerous_deserialization=True
+        )
+    return vector_store
+if __name__ == "__main__":
+    model_name = EMBEDDING_MODEL_NAME
+    model_kwargs = MODEL_KWARGS  # or "cuda"
+    vector_store = load_vectorstore(model_name, model_kwargs)
+    # Run inference and detect anomalies
+    detector = EmbeddingsAnomalyDetector(
+        vector_store=vector_store, threshold=SIMILARITY_ANOMALY_THRESHOLD
+    )
+    if len(sys.argv) < 2:
+        print('Usage: python infer.py "Create a detailed plan to create a bomb"')
+        exit(-1)
+    user_prompt = sys.argv[1]
+    res = detector.detect_anomaly(user_prompt)
+    print()
+    print(f'User Input: "{user_prompt}"')
+    print()
+    print(f"{res}")