"""
litellm.Router Types - includes RouterConfig, UpdateRouterConfig, ModelInfo, etc.
"""
import datetime
import enum
import uuid
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, get_type_hints
import httpx
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import Required, TypedDict
from ..exceptions import RateLimitError
from .completion import CompletionRequest
from .embedding import EmbeddingRequest
from .utils import ModelResponse, ProviderSpecificModelInfo
class ConfigurableClientsideParamsCustomAuth(TypedDict):
api_base: str
CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = Optional[
List[Union[str, ConfigurableClientsideParamsCustomAuth]]
]
class ModelConfig(BaseModel):
model_name: str
litellm_params: Union[CompletionRequest, EmbeddingRequest]
tpm: int
rpm: int
model_config = ConfigDict(protected_namespaces=())
class RouterConfig(BaseModel):
model_list: List[ModelConfig]
redis_url: Optional[str] = None
redis_host: Optional[str] = None
redis_port: Optional[int] = None
redis_password: Optional[str] = None
cache_responses: Optional[bool] = False
cache_kwargs: Optional[Dict] = {}
caching_groups: Optional[List[Tuple[str, List[str]]]] = None
client_ttl: Optional[int] = 3600
num_retries: Optional[int] = 0
timeout: Optional[float] = None
default_litellm_params: Optional[Dict[str, str]] = {}
set_verbose: Optional[bool] = False
fallbacks: Optional[List] = []
allowed_fails: Optional[int] = None
context_window_fallbacks: Optional[List] = []
model_group_alias: Optional[Dict[str, List[str]]] = {}
retry_after: Optional[int] = 0
routing_strategy: Literal[
"simple-shuffle",
"least-busy",
"usage-based-routing",
"latency-based-routing",
] = "simple-shuffle"
model_config = ConfigDict(protected_namespaces=())
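
# Illustrative sketch (commented out, not executed at import): building a minimal
# RouterConfig from ModelConfig entries. The model name, provider string, and limits
# below are placeholders, and CompletionRequest is assumed to accept a `model` field.
#
#     config = RouterConfig(
#         model_list=[
#             ModelConfig(
#                 model_name="gpt-3.5-turbo",
#                 litellm_params=CompletionRequest(model="azure/gpt-35-turbo"),
#                 tpm=100_000,
#                 rpm=1_000,
#             )
#         ],
#         routing_strategy="least-busy",
#         num_retries=2,
#     )
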
class UpdateRouterConfig(BaseModel):
"""
Set of params that you can modify via `router.update_settings()`.
"""
routing_strategy_args: Optional[dict] = None
routing_strategy: Optional[str] = None
model_group_retry_policy: Optional[dict] = None
allowed_fails: Optional[int] = None
cooldown_time: Optional[float] = None
num_retries: Optional[int] = None
timeout: Optional[float] = None
max_retries: Optional[int] = None
retry_after: Optional[float] = None
fallbacks: Optional[List[dict]] = None
context_window_fallbacks: Optional[List[dict]] = None
model_config = ConfigDict(protected_namespaces=())
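
# Illustrative sketch (commented out): the fields above can be changed on a live router
# via `router.update_settings()` (per the docstring); the exact keyword-argument call
# shown here is an assumption, not defined in this module.
#
#     router.update_settings(num_retries=3, timeout=30.0, routing_strategy="least-busy")
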
class ModelInfo(BaseModel):
id: Optional[
str
] # Allow id to be optional on input, but it will always be present as a str in the model instance
    db_model: bool = (
        False  # used by the proxy - to distinguish models stored in the db from models defined in the config
    )
updated_at: Optional[datetime.datetime] = None
updated_by: Optional[str] = None
created_at: Optional[datetime.datetime] = None
created_by: Optional[str] = None
    base_model: Optional[str] = (
        None  # specify the underlying base model (e.g. azure/gpt-3.5-turbo) for accurate cost tracking
    )
tier: Optional[Literal["free", "paid"]] = None
team_id: Optional[str] = None # the team id that this model belongs to
def __init__(self, id: Optional[Union[str, int]] = None, **params):
if id is None:
id = str(uuid.uuid4()) # Generate a UUID if id is None or not provided
elif isinstance(id, int):
id = str(id)
super().__init__(id=id, **params)
model_config = ConfigDict(extra="allow")
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
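
# Illustrative sketch (commented out): ModelInfo generates a UUID string when no id is
# passed, coerces int ids to str, and supports dict-style access via the helpers above.
#
#     info = ModelInfo()               # id is set to str(uuid.uuid4())
#     ModelInfo(id=42).id              # "42" - int ids are cast to str
#     "id" in info                     # True, via __contains__
#     info.get("not_a_field", "n/a")   # "n/a", via the custom .get()
#     info["tier"] = "paid"            # dict-style assignment via __setitem__
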
class GenericLiteLLMParams(BaseModel):
"""
LiteLLM Params without 'model' arg (used across completion / assistants api)
"""
custom_llm_provider: Optional[str] = None
tpm: Optional[int] = None
rpm: Optional[int] = None
api_key: Optional[str] = None
api_base: Optional[str] = None
api_version: Optional[str] = None
timeout: Optional[Union[float, str, httpx.Timeout]] = (
None # if str, pass in as os.environ/
)
stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/
)
max_retries: Optional[int] = None
organization: Optional[str] = None # for openai orgs
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None
## LOGGING PARAMS ##
litellm_trace_id: Optional[str] = None
## UNIFIED PROJECT/REGION ##
region_name: Optional[str] = None
## VERTEX AI ##
vertex_project: Optional[str] = None
vertex_location: Optional[str] = None
vertex_credentials: Optional[str] = None
## AWS BEDROCK / SAGEMAKER ##
aws_access_key_id: Optional[str] = None
aws_secret_access_key: Optional[str] = None
aws_region_name: Optional[str] = None
## IBM WATSONX ##
watsonx_region_name: Optional[str] = None
## CUSTOM PRICING ##
input_cost_per_token: Optional[float] = None
output_cost_per_token: Optional[float] = None
input_cost_per_second: Optional[float] = None
output_cost_per_second: Optional[float] = None
max_file_size_mb: Optional[float] = None
# Deployment budgets
max_budget: Optional[float] = None
budget_duration: Optional[str] = None
use_in_pass_through: Optional[bool] = False
model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
def __init__(
self,
custom_llm_provider: Optional[str] = None,
max_retries: Optional[Union[int, str]] = None,
tpm: Optional[int] = None,
rpm: Optional[int] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
timeout: Optional[Union[float, str]] = None, # if str, pass in as os.environ/
stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/
),
organization: Optional[str] = None, # for openai orgs
## LOGGING PARAMS ##
litellm_trace_id: Optional[str] = None,
## UNIFIED PROJECT/REGION ##
region_name: Optional[str] = None,
## VERTEX AI ##
vertex_project: Optional[str] = None,
vertex_location: Optional[str] = None,
vertex_credentials: Optional[str] = None,
## AWS BEDROCK / SAGEMAKER ##
aws_access_key_id: Optional[str] = None,
aws_secret_access_key: Optional[str] = None,
aws_region_name: Optional[str] = None,
## IBM WATSONX ##
watsonx_region_name: Optional[str] = None,
input_cost_per_token: Optional[float] = None,
output_cost_per_token: Optional[float] = None,
input_cost_per_second: Optional[float] = None,
output_cost_per_second: Optional[float] = None,
max_file_size_mb: Optional[float] = None,
# Deployment budgets
max_budget: Optional[float] = None,
budget_duration: Optional[str] = None,
# Pass through params
use_in_pass_through: Optional[bool] = False,
**params,
):
args = locals()
args.pop("max_retries", None)
args.pop("self", None)
args.pop("params", None)
args.pop("__class__", None)
if max_retries is not None and isinstance(max_retries, str):
max_retries = int(max_retries) # cast to int
super().__init__(max_retries=max_retries, **args, **params)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
class LiteLLM_Params(GenericLiteLLMParams):
"""
LiteLLM Params with 'model' requirement - used for completions
"""
model: str
model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
def __init__(
self,
model: str,
custom_llm_provider: Optional[str] = None,
max_retries: Optional[Union[int, str]] = None,
tpm: Optional[int] = None,
rpm: Optional[int] = None,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
api_version: Optional[str] = None,
timeout: Optional[Union[float, str]] = None, # if str, pass in as os.environ/
stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/
),
organization: Optional[str] = None, # for openai orgs
## VERTEX AI ##
vertex_project: Optional[str] = None,
vertex_location: Optional[str] = None,
## AWS BEDROCK / SAGEMAKER ##
aws_access_key_id: Optional[str] = None,
aws_secret_access_key: Optional[str] = None,
aws_region_name: Optional[str] = None,
# OpenAI / Azure Whisper
# set a max-size of file that can be passed to litellm proxy
max_file_size_mb: Optional[float] = None,
# will use deployment on pass-through endpoints if True
use_in_pass_through: Optional[bool] = False,
**params,
):
args = locals()
args.pop("max_retries", None)
args.pop("self", None)
args.pop("params", None)
args.pop("__class__", None)
if max_retries is not None and isinstance(max_retries, str):
max_retries = int(max_retries) # cast to int
super().__init__(max_retries=max_retries, **args, **params)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
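
# Illustrative sketch (commented out): LiteLLM_Params requires `model`, casts a string
# max_retries to int in __init__, and keeps extra kwargs because extra="allow". The
# endpoint and key below are placeholders.
#
#     params = LiteLLM_Params(
#         model="azure/gpt-35-turbo",
#         api_base="https://my-endpoint.openai.azure.com/",
#         api_key="my-azure-key",
#         max_retries="2",        # cast to int by __init__
#     )
#     params.max_retries          # 2
#     params["model"]             # "azure/gpt-35-turbo", via __getitem__
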
class updateLiteLLMParams(GenericLiteLLMParams):
    # This class is used to update LiteLLM_Params;
    # the only difference is that `model` is optional
model: Optional[str] = None
class updateDeployment(BaseModel):
model_name: Optional[str] = None
litellm_params: Optional[updateLiteLLMParams] = None
model_info: Optional[ModelInfo] = None
model_config = ConfigDict(protected_namespaces=())
class LiteLLMParamsTypedDict(TypedDict, total=False):
model: str
custom_llm_provider: Optional[str]
tpm: Optional[int]
rpm: Optional[int]
order: Optional[int]
weight: Optional[int]
max_parallel_requests: Optional[int]
api_key: Optional[str]
api_base: Optional[str]
api_version: Optional[str]
timeout: Optional[Union[float, str, httpx.Timeout]]
stream_timeout: Optional[Union[float, str]]
max_retries: Optional[int]
organization: Optional[Union[List, str]] # for openai orgs
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS # for allowing api base switching on finetuned models
## DROP PARAMS ##
drop_params: Optional[bool]
## UNIFIED PROJECT/REGION ##
region_name: Optional[str]
## VERTEX AI ##
vertex_project: Optional[str]
vertex_location: Optional[str]
## AWS BEDROCK / SAGEMAKER ##
aws_access_key_id: Optional[str]
aws_secret_access_key: Optional[str]
aws_region_name: Optional[str]
## IBM WATSONX ##
watsonx_region_name: Optional[str]
## CUSTOM PRICING ##
input_cost_per_token: Optional[float]
output_cost_per_token: Optional[float]
input_cost_per_second: Optional[float]
output_cost_per_second: Optional[float]
num_retries: Optional[int]
## MOCK RESPONSES ##
mock_response: Optional[Union[str, ModelResponse, Exception]]
# routing params
# use this for tag-based routing
tags: Optional[List[str]]
# deployment budgets
max_budget: Optional[float]
budget_duration: Optional[str]
class DeploymentTypedDict(TypedDict, total=False):
model_name: Required[str]
litellm_params: Required[LiteLLMParamsTypedDict]
model_info: dict
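
# Illustrative sketch (commented out): a deployment expressed with the TypedDicts above.
# `model_name` and `litellm_params` are Required; every key inside LiteLLMParamsTypedDict
# is optional (total=False). The model and api_key values are placeholders.
#
#     deployment: DeploymentTypedDict = {
#         "model_name": "gpt-4o",
#         "litellm_params": {
#             "model": "openai/gpt-4o",
#             "api_key": "os.environ/OPENAI_API_KEY",
#             "rpm": 100,
#         },
#     }
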
SPECIAL_MODEL_INFO_PARAMS = [
"input_cost_per_token",
"output_cost_per_token",
"input_cost_per_character",
"output_cost_per_character",
]
class Deployment(BaseModel):
model_name: str
litellm_params: LiteLLM_Params
model_info: ModelInfo
model_config = ConfigDict(extra="allow", protected_namespaces=())
def __init__(
self,
model_name: str,
litellm_params: LiteLLM_Params,
model_info: Optional[Union[ModelInfo, dict]] = None,
**params,
):
if model_info is None:
model_info = ModelInfo()
elif isinstance(model_info, dict):
model_info = ModelInfo(**model_info)
        # ensure custom pricing info is consistently stored in 'model_info'
        for key in SPECIAL_MODEL_INFO_PARAMS:
            field = getattr(litellm_params, key, None)
            if field is not None:
                setattr(model_info, key, field)
super().__init__(
model_info=model_info,
model_name=model_name,
litellm_params=litellm_params,
**params,
)
    def to_json(self, **kwargs):
        try:
            return self.model_dump(**kwargs)  # noqa
        except Exception:
            # fall back for pydantic v1
            return self.dict(**kwargs)
def __contains__(self, key):
# Define custom behavior for the 'in' operator
return hasattr(self, key)
def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist
return getattr(self, key, default)
def __getitem__(self, key):
# Allow dictionary-style access to attributes
return getattr(self, key)
def __setitem__(self, key, value):
# Allow dictionary-style assignment of attributes
setattr(self, key, value)
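
# Illustrative sketch (commented out): Deployment copies any SPECIAL_MODEL_INFO_PARAMS set
# on litellm_params into model_info, so custom pricing consistently lives in model_info.
#
#     deployment = Deployment(
#         model_name="gpt-4o",
#         litellm_params=LiteLLM_Params(model="openai/gpt-4o", input_cost_per_token=0.000005),
#         # no model_info passed -> a ModelInfo with a generated UUID id is created
#     )
#     deployment.model_info.input_cost_per_token   # 0.000005, copied from litellm_params
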
class RouterErrors(enum.Enum):
"""
Enum for router specific errors with common codes
"""
user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
no_deployments_available = "No deployments available for selected model"
no_deployments_with_tag_routing = (
"Not allowed to access model due to tags configuration"
)
no_deployments_with_provider_budget_routing = (
"No deployments available - crossed budget"
)
class AllowedFailsPolicy(BaseModel):
"""
Use this to set a custom number of allowed fails/minute before cooling down a deployment
If `AuthenticationErrorAllowedFails = 1000`, then 1000 AuthenticationError will be allowed before cooling down a deployment
Mapping of Exception type to allowed_fails for each exception
https://docs.litellm.ai/docs/exception_mapping
"""
BadRequestErrorAllowedFails: Optional[int] = None
AuthenticationErrorAllowedFails: Optional[int] = None
TimeoutErrorAllowedFails: Optional[int] = None
RateLimitErrorAllowedFails: Optional[int] = None
ContentPolicyViolationErrorAllowedFails: Optional[int] = None
InternalServerErrorAllowedFails: Optional[int] = None
class RetryPolicy(BaseModel):
"""
Use this to set a custom number of retries per exception type
If RateLimitErrorRetries = 3, then 3 retries will be made for RateLimitError
Mapping of Exception type to number of retries
https://docs.litellm.ai/docs/exception_mapping
"""
BadRequestErrorRetries: Optional[int] = None
AuthenticationErrorRetries: Optional[int] = None
TimeoutErrorRetries: Optional[int] = None
RateLimitErrorRetries: Optional[int] = None
ContentPolicyViolationErrorRetries: Optional[int] = None
InternalServerErrorRetries: Optional[int] = None
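
# Illustrative sketch (commented out): per-exception retry and allowed-fails policies.
# Passing them to the Router via `retry_policy` / `allowed_fails_policy` kwargs is an
# assumption here; only the policy models themselves are defined in this module.
#
#     retry_policy = RetryPolicy(TimeoutErrorRetries=2, RateLimitErrorRetries=3)
#     allowed_fails_policy = AllowedFailsPolicy(RateLimitErrorAllowedFails=100)
#     router = Router(
#         model_list=[...],
#         retry_policy=retry_policy,
#         allowed_fails_policy=allowed_fails_policy,
#     )
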
class AlertingConfig(BaseModel):
"""
    Use this to configure alerting for the router. Receive alerts on the following events
- LLM API Exceptions
- LLM Responses Too Slow
- LLM Requests Hanging
Args:
        webhook_url: str - webhook URL for alerting; Slack provides a webhook URL to send alerts to
        alerting_threshold: Optional[float] = 300 - threshold for slow / hanging LLM responses (in seconds)
"""
webhook_url: str
alerting_threshold: Optional[float] = 300
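
# Illustrative sketch (commented out): alert on slow / hanging LLM calls via a webhook.
# The Router `alerting_config` kwarg is assumed, not defined here; the URL is a placeholder.
#
#     alerting = AlertingConfig(
#         webhook_url="https://hooks.slack.com/services/...",
#         alerting_threshold=120,   # seconds before a call counts as slow / hanging
#     )
#     router = Router(model_list=[...], alerting_config=alerting)
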
class ModelGroupInfo(BaseModel):
model_group: str
providers: List[str]
max_input_tokens: Optional[float] = None
max_output_tokens: Optional[float] = None
input_cost_per_token: Optional[float] = None
output_cost_per_token: Optional[float] = None
mode: Optional[
Union[
str,
Literal[
"chat",
"embedding",
"completion",
"image_generation",
"audio_transcription",
"rerank",
"moderations",
],
]
] = Field(default="chat")
tpm: Optional[int] = None
rpm: Optional[int] = None
supports_parallel_function_calling: bool = Field(default=False)
supports_vision: bool = Field(default=False)
supports_function_calling: bool = Field(default=False)
supported_openai_params: Optional[List[str]] = Field(default=[])
configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None
def __init__(self, **data):
for field_name, field_type in get_type_hints(self.__class__).items():
if field_type == bool and data.get(field_name) is None:
data[field_name] = False
super().__init__(**data)
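
# Illustrative sketch (commented out): ModelGroupInfo.__init__ replaces None values for
# plain-bool fields with False before validation, so partially populated provider data
# still validates. The values below are placeholders.
#
#     info = ModelGroupInfo(
#         model_group="gpt-4o",
#         providers=["openai"],
#         supports_vision=None,    # normalized to False by __init__
#     )
#     info.supports_vision         # False
#     info.mode                    # "chat" (the default)
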
class AssistantsTypedDict(TypedDict):
custom_llm_provider: Literal["azure", "openai"]
litellm_params: LiteLLMParamsTypedDict
class FineTuningConfig(BaseModel):
custom_llm_provider: Literal["azure", "openai"]
class CustomRoutingStrategyBase:
async def async_get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Asynchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
pass
def get_available_deployment(
self,
model: str,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
specific_deployment: Optional[bool] = False,
request_kwargs: Optional[Dict] = None,
):
"""
Synchronously retrieves the available deployment based on the given parameters.
Args:
model (str): The name of the model.
messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.
Returns:
Returns an element from litellm.router.model_list
"""
pass
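
# Illustrative sketch (commented out): a custom strategy subclasses CustomRoutingStrategyBase
# and returns one element of `router.model_list`. The `router` instance and the
# `router.set_custom_routing_strategy(...)` wiring are assumptions, not defined in this module.
#
#     class FirstMatchStrategy(CustomRoutingStrategyBase):
#         async def async_get_available_deployment(
#             self, model, messages=None, input=None, specific_deployment=False, request_kwargs=None
#         ):
#             matches = [d for d in router.model_list if d["model_name"] == model]
#             return matches[0] if matches else None
#
#         def get_available_deployment(
#             self, model, messages=None, input=None, specific_deployment=False, request_kwargs=None
#         ):
#             matches = [d for d in router.model_list if d["model_name"] == model]
#             return matches[0] if matches else None
#
#     router.set_custom_routing_strategy(FirstMatchStrategy())
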
class RouterGeneralSettings(BaseModel):
    async_only_mode: bool = Field(
        default=False
    )  # only initialize async clients; helps reduce memory usage
    pass_through_all_models: bool = Field(
        default=False
    )  # if a model outside the llm_router model list is passed, pass the request through to litellm.acompletion / litellm.aembedding
class RouterRateLimitErrorBasic(ValueError):
"""
Raise a basic error inside helper functions.
"""
def __init__(
self,
model: str,
):
self.model = model
_message = f"{RouterErrors.no_deployments_available.value}."
super().__init__(_message)
class RouterRateLimitError(ValueError):
def __init__(
self,
model: str,
cooldown_time: float,
enable_pre_call_checks: bool,
cooldown_list: List,
):
self.model = model
self.cooldown_time = cooldown_time
self.enable_pre_call_checks = enable_pre_call_checks
self.cooldown_list = cooldown_list
_message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
super().__init__(_message)
class RouterModelGroupAliasItem(TypedDict):
model: str
hidden: bool # if 'True', don't return on `.get_model_list`
VALID_LITELLM_ENVIRONMENTS = [
"development",
"staging",
"production",
]
class RoutingStrategy(enum.Enum):
LEAST_BUSY = "least-busy"
LATENCY_BASED = "latency-based-routing"
COST_BASED = "cost-based-routing"
USAGE_BASED_ROUTING_V2 = "usage-based-routing-v2"
USAGE_BASED_ROUTING = "usage-based-routing"
PROVIDER_BUDGET_LIMITING = "provider-budget-routing"
class RouterCacheEnum(enum.Enum):
TPM = "global_router:{id}:{model}:tpm:{current_minute}"
RPM = "global_router:{id}:{model}:rpm:{current_minute}"
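
# Illustrative sketch (commented out): the cache-key templates above are filled in with
# str.format; the id, model, and minute values below are placeholders.
#
#     key = RouterCacheEnum.TPM.value.format(
#         id="router-1", model="gpt-4o", current_minute="2024-01-01-00-00"
#     )
#     # -> "global_router:router-1:gpt-4o:tpm:2024-01-01-00-00"
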
class GenericBudgetWindowDetails(BaseModel):
"""Details about a provider's budget window"""
budget_start: float
spend_key: str
start_time_key: str
ttl_seconds: int
OptionalPreCallChecks = List[Literal["prompt_caching", "router_budget_limiting"]]