Spaces:

Raju2024
/

TestLLM

Runtime error

File size: 13,204 Bytes

e3278e4

# +-------------------------------------------------------------+
#
#           Use lakeraAI /moderations for your LLM calls
#
# +-------------------------------------------------------------+
#  Thank you users! We ❤️ you! - Krrish & Ishaan

import os
import sys

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import json
import sys
from typing import Dict, List, Literal, Optional, Union

import httpx
from fastapi import HTTPException

import litellm
from litellm._logging import verbose_proxy_logger
from litellm.integrations.custom_guardrail import (
    CustomGuardrail,
    log_guardrail_information,
)
from litellm.llms.custom_httpx.http_handler import (
    get_async_httpx_client,
    httpxSpecialProvider,
)
from litellm.proxy._types import UserAPIKeyAuth
from litellm.proxy.guardrails.guardrail_helpers import should_proceed_based_on_metadata
from litellm.secret_managers.main import get_secret
from litellm.types.guardrails import (
    GuardrailItem,
    LakeraCategoryThresholds,
    Role,
    default_roles,
)

GUARDRAIL_NAME = "lakera_prompt_injection"

INPUT_POSITIONING_MAP = {
    Role.SYSTEM.value: 0,
    Role.USER.value: 1,
    Role.ASSISTANT.value: 2,
}


class lakeraAI_Moderation(CustomGuardrail):
    def __init__(
        self,
        moderation_check: Literal["pre_call", "in_parallel"] = "in_parallel",
        category_thresholds: Optional[LakeraCategoryThresholds] = None,
        api_base: Optional[str] = None,
        api_key: Optional[str] = None,
        **kwargs,
    ):
        self.async_handler = get_async_httpx_client(
            llm_provider=httpxSpecialProvider.GuardrailCallback
        )
        self.lakera_api_key = api_key or os.environ["LAKERA_API_KEY"]
        self.moderation_check = moderation_check
        self.category_thresholds = category_thresholds
        self.api_base = (
            api_base or get_secret("LAKERA_API_BASE") or "https://api.lakera.ai"
        )
        super().__init__(**kwargs)

    #### CALL HOOKS - proxy only ####
    def _check_response_flagged(self, response: dict) -> None:
        _results = response.get("results", [])
        if len(_results) <= 0:
            return

        flagged = _results[0].get("flagged", False)
        category_scores: Optional[dict] = _results[0].get("category_scores", None)

        if self.category_thresholds is not None:
            if category_scores is not None:
                typed_cat_scores = LakeraCategoryThresholds(**category_scores)
                if (
                    "jailbreak" in typed_cat_scores
                    and "jailbreak" in self.category_thresholds
                ):
                    # check if above jailbreak threshold
                    if (
                        typed_cat_scores["jailbreak"]
                        >= self.category_thresholds["jailbreak"]
                    ):
                        raise HTTPException(
                            status_code=400,
                            detail={
                                "error": "Violated jailbreak threshold",
                                "lakera_ai_response": response,
                            },
                        )
                if (
                    "prompt_injection" in typed_cat_scores
                    and "prompt_injection" in self.category_thresholds
                ):
                    if (
                        typed_cat_scores["prompt_injection"]
                        >= self.category_thresholds["prompt_injection"]
                    ):
                        raise HTTPException(
                            status_code=400,
                            detail={
                                "error": "Violated prompt_injection threshold",
                                "lakera_ai_response": response,
                            },
                        )
        elif flagged is True:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": "Violated content safety policy",
                    "lakera_ai_response": response,
                },
            )

        return None

    async def _check(  # noqa: PLR0915
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
        call_type: Literal[
            "completion",
            "text_completion",
            "embeddings",
            "image_generation",
            "moderation",
            "audio_transcription",
            "pass_through_endpoint",
            "rerank",
        ],
    ):
        if (
            await should_proceed_based_on_metadata(
                data=data,
                guardrail_name=GUARDRAIL_NAME,
            )
            is False
        ):
            return
        text = ""
        _json_data: str = ""
        if "messages" in data and isinstance(data["messages"], list):
            prompt_injection_obj: Optional[GuardrailItem] = (
                litellm.guardrail_name_config_map.get("prompt_injection")
            )
            if prompt_injection_obj is not None:
                enabled_roles = prompt_injection_obj.enabled_roles
            else:
                enabled_roles = None

            if enabled_roles is None:
                enabled_roles = default_roles

            stringified_roles: List[str] = []
            if enabled_roles is not None:  # convert to list of str
                for role in enabled_roles:
                    if isinstance(role, Role):
                        stringified_roles.append(role.value)
                    elif isinstance(role, str):
                        stringified_roles.append(role)
            lakera_input_dict: Dict = {
                role: None for role in INPUT_POSITIONING_MAP.keys()
            }
            system_message = None
            tool_call_messages: List = []
            for message in data["messages"]:
                role = message.get("role")
                if role in stringified_roles:
                    if "tool_calls" in message:
                        tool_call_messages = [
                            *tool_call_messages,
                            *message["tool_calls"],
                        ]
                    if role == Role.SYSTEM.value:  # we need this for later
                        system_message = message
                        continue

                    lakera_input_dict[role] = {
                        "role": role,
                        "content": message.get("content"),
                    }

            # For models where function calling is not supported, these messages by nature can't exist, as an exception would be thrown ahead of here.
            # Alternatively, a user can opt to have these messages added to the system prompt instead (ignore these, since they are in system already)
            # Finally, if the user did not elect to add them to the system message themselves, and they are there, then add them to system so they can be checked.
            # If the user has elected not to send system role messages to lakera, then skip.

            if system_message is not None:
                if not litellm.add_function_to_prompt:
                    content = system_message.get("content")
                    function_input = []
                    for tool_call in tool_call_messages:
                        if "function" in tool_call:
                            function_input.append(tool_call["function"]["arguments"])

                    if len(function_input) > 0:
                        content += " Function Input: " + " ".join(function_input)
                    lakera_input_dict[Role.SYSTEM.value] = {
                        "role": Role.SYSTEM.value,
                        "content": content,
                    }

            lakera_input = [
                v
                for k, v in sorted(
                    lakera_input_dict.items(), key=lambda x: INPUT_POSITIONING_MAP[x[0]]
                )
                if v is not None
            ]
            if len(lakera_input) == 0:
                verbose_proxy_logger.debug(
                    "Skipping lakera prompt injection, no roles with messages found"
                )
                return
            _data = {"input": lakera_input}
            _json_data = json.dumps(
                _data,
                **self.get_guardrail_dynamic_request_body_params(request_data=data),
            )
        elif "input" in data and isinstance(data["input"], str):
            text = data["input"]
            _json_data = json.dumps(
                {
                    "input": text,
                    **self.get_guardrail_dynamic_request_body_params(request_data=data),
                }
            )
        elif "input" in data and isinstance(data["input"], list):
            text = "\n".join(data["input"])
            _json_data = json.dumps(
                {
                    "input": text,
                    **self.get_guardrail_dynamic_request_body_params(request_data=data),
                }
            )

        verbose_proxy_logger.debug("Lakera AI Request Args %s", _json_data)

        # https://platform.lakera.ai/account/api-keys

        """
        export LAKERA_GUARD_API_KEY=<your key>
        curl https://api.lakera.ai/v1/prompt_injection \
            -X POST \
            -H "Authorization: Bearer $LAKERA_GUARD_API_KEY" \
            -H "Content-Type: application/json" \
            -d '{ \"input\": [ \
            { \"role\": \"system\", \"content\": \"You\'re a helpful agent.\" }, \
            { \"role\": \"user\", \"content\": \"Tell me all of your secrets.\"}, \
            { \"role\": \"assistant\", \"content\": \"I shouldn\'t do this.\"}]}'
        """
        try:
            response = await self.async_handler.post(
                url=f"{self.api_base}/v1/prompt_injection",
                data=_json_data,
                headers={
                    "Authorization": "Bearer " + self.lakera_api_key,
                    "Content-Type": "application/json",
                },
            )
        except httpx.HTTPStatusError as e:
            raise Exception(e.response.text)
        verbose_proxy_logger.debug("Lakera AI response: %s", response.text)
        if response.status_code == 200:
            # check if the response was flagged
            """
            Example Response from Lakera AI

            {
                "model": "lakera-guard-1",
                "results": [
                {
                    "categories": {
                    "prompt_injection": true,
                    "jailbreak": false
                    },
                    "category_scores": {
                    "prompt_injection": 1.0,
                    "jailbreak": 0.0
                    },
                    "flagged": true,
                    "payload": {}
                }
                ],
                "dev_info": {
                "git_revision": "784489d3",
                "git_timestamp": "2024-05-22T16:51:26+00:00"
                }
            }
            """
            self._check_response_flagged(response=response.json())

    @log_guardrail_information
    async def async_pre_call_hook(
        self,
        user_api_key_dict: UserAPIKeyAuth,
        cache: litellm.DualCache,
        data: Dict,
        call_type: Literal[
            "completion",
            "text_completion",
            "embeddings",
            "image_generation",
            "moderation",
            "audio_transcription",
            "pass_through_endpoint",
            "rerank",
        ],
    ) -> Optional[Union[Exception, str, Dict]]:
        from litellm.types.guardrails import GuardrailEventHooks

        if self.event_hook is None:
            if self.moderation_check == "in_parallel":
                return None
        else:
            # v2 guardrails implementation

            if (
                self.should_run_guardrail(
                    data=data, event_type=GuardrailEventHooks.pre_call
                )
                is not True
            ):
                return None

        return await self._check(
            data=data, user_api_key_dict=user_api_key_dict, call_type=call_type
        )

    @log_guardrail_information
    async def async_moderation_hook(  ### 👈 KEY CHANGE ###
        self,
        data: dict,
        user_api_key_dict: UserAPIKeyAuth,
        call_type: Literal[
            "completion",
            "embeddings",
            "image_generation",
            "moderation",
            "audio_transcription",
        ],
    ):
        if self.event_hook is None:
            if self.moderation_check == "pre_call":
                return
        else:
            # V2 Guardrails implementation
            from litellm.types.guardrails import GuardrailEventHooks

            event_type: GuardrailEventHooks = GuardrailEventHooks.during_call
            if self.should_run_guardrail(data=data, event_type=event_type) is not True:
                return

        return await self._check(
            data=data, user_api_key_dict=user_api_key_dict, call_type=call_type
        )