"""
litellm.Router Types - includes RouterConfig, UpdateRouterConfig, ModelInfo etc.
"""

import datetime
import enum
import uuid
from typing import Any, Dict, List, Literal, Optional, Tuple, Union, get_type_hints

import httpx
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import Required, TypedDict

from ..exceptions import RateLimitError
from .completion import CompletionRequest
from .embedding import EmbeddingRequest
from .utils import ModelResponse, ProviderSpecificModelInfo


class ConfigurableClientsideParamsCustomAuth(TypedDict):
    api_base: str


CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = Optional[
    List[Union[str, ConfigurableClientsideParamsCustomAuth]]
]
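

# Illustrative only: a sketch of what CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS can
# hold -- plain param-name strings or a ConfigurableClientsideParamsCustomAuth
# dict. The names and URL below are placeholders, not a verified list.
#
#   allowed_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = [
#       "api_key",
#       {"api_base": "https://my-proxy.example.com"},
#   ]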


class ModelConfig(BaseModel):
    model_name: str
    litellm_params: Union[CompletionRequest, EmbeddingRequest]
    tpm: int
    rpm: int

    model_config = ConfigDict(protected_namespaces=())


class RouterConfig(BaseModel):
    model_list: List[ModelConfig]

    redis_url: Optional[str] = None
    redis_host: Optional[str] = None
    redis_port: Optional[int] = None
    redis_password: Optional[str] = None

    cache_responses: Optional[bool] = False
    cache_kwargs: Optional[Dict] = {}
    caching_groups: Optional[List[Tuple[str, List[str]]]] = None
    client_ttl: Optional[int] = 3600
    num_retries: Optional[int] = 0
    timeout: Optional[float] = None
    default_litellm_params: Optional[Dict[str, str]] = {}
    set_verbose: Optional[bool] = False
    fallbacks: Optional[List] = []
    allowed_fails: Optional[int] = None
    context_window_fallbacks: Optional[List] = []
    model_group_alias: Optional[Dict[str, List[str]]] = {}
    retry_after: Optional[int] = 0
    routing_strategy: Literal[
        "simple-shuffle",
        "least-busy",
        "usage-based-routing",
        "latency-based-routing",
    ] = "simple-shuffle"

    model_config = ConfigDict(protected_namespaces=())
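

# A minimal, illustrative RouterConfig. The model name and rate limits are
# placeholder values, not recommendations; CompletionRequest is the litellm
# request type imported above.
#
#   config = RouterConfig(
#       model_list=[
#           ModelConfig(
#               model_name="gpt-3.5-turbo",
#               litellm_params=CompletionRequest(model="gpt-3.5-turbo"),
#               tpm=100_000,
#               rpm=1_000,
#           )
#       ],
#       routing_strategy="least-busy",
#       num_retries=2,
#   )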


class UpdateRouterConfig(BaseModel):
    """
    Set of params that you can modify via `router.update_settings()`.
    """

    routing_strategy_args: Optional[dict] = None
    routing_strategy: Optional[str] = None
    model_group_retry_policy: Optional[dict] = None
    allowed_fails: Optional[int] = None
    cooldown_time: Optional[float] = None
    num_retries: Optional[int] = None
    timeout: Optional[float] = None
    max_retries: Optional[int] = None
    retry_after: Optional[float] = None
    fallbacks: Optional[List[dict]] = None
    context_window_fallbacks: Optional[List[dict]] = None

    model_config = ConfigDict(protected_namespaces=())
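

# Sketch of how these settings are applied at runtime (assumes an existing
# Router instance named `router`; `router.update_settings` is the entry point
# named in the docstring above -- check the Router docs for its exact
# signature):
#
#   router.update_settings(num_retries=3, timeout=30.0)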


class ModelInfo(BaseModel):
    id: Optional[str]
    db_model: bool = False  # used by the proxy to flag models stored in the db
    updated_at: Optional[datetime.datetime] = None
    updated_by: Optional[str] = None

    created_at: Optional[datetime.datetime] = None
    created_by: Optional[str] = None

    base_model: Optional[str] = None  # specify if the base model is azure/gpt-3.5-turbo etc. (for accurate cost tracking)
    tier: Optional[Literal["free", "paid"]] = None
    team_id: Optional[str] = None

    def __init__(self, id: Optional[Union[str, int]] = None, **params):
        if id is None:
            id = str(uuid.uuid4())  # generate an id, if none is given
        elif isinstance(id, int):
            id = str(id)
        super().__init__(id=id, **params)

    model_config = ConfigDict(extra="allow")

    def __contains__(self, key):
        # allow `"key" in model_info` checks
        return hasattr(self, key)

    def get(self, key, default=None):
        # dict-style .get() access
        return getattr(self, key, default)

    def __getitem__(self, key):
        # dict-style indexing
        return getattr(self, key)

    def __setitem__(self, key, value):
        # dict-style assignment
        setattr(self, key, value)
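

# Illustrative: ModelInfo behaves like both a pydantic model and a dict:
#
#   info = ModelInfo(id=42)       # int ids are coerced to str -> "42"
#   info["team_id"] = "team-1"    # dict-style writes via __setitem__
#   info.get("team_id")           # dict-style reads via get() -> "team-1"
#   "tier" in info                # membership via __contains__ -> True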


class GenericLiteLLMParams(BaseModel):
    """
    LiteLLM Params without 'model' arg (used across completion / assistants api)
    """

    custom_llm_provider: Optional[str] = None
    tpm: Optional[int] = None
    rpm: Optional[int] = None
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    api_version: Optional[str] = None
    timeout: Optional[Union[float, str, httpx.Timeout]] = None
    stream_timeout: Optional[Union[float, str]] = None  # timeout for stream=True requests
    max_retries: Optional[int] = None
    organization: Optional[str] = None
    configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

    litellm_trace_id: Optional[str] = None

    # unified project/region
    region_name: Optional[str] = None

    # Vertex AI
    vertex_project: Optional[str] = None
    vertex_location: Optional[str] = None
    vertex_credentials: Optional[str] = None

    # AWS Bedrock / Sagemaker
    aws_access_key_id: Optional[str] = None
    aws_secret_access_key: Optional[str] = None
    aws_region_name: Optional[str] = None

    # IBM watsonx
    watsonx_region_name: Optional[str] = None

    # custom pricing
    input_cost_per_token: Optional[float] = None
    output_cost_per_token: Optional[float] = None
    input_cost_per_second: Optional[float] = None
    output_cost_per_second: Optional[float] = None

    max_file_size_mb: Optional[float] = None

    # deployment budgets
    max_budget: Optional[float] = None
    budget_duration: Optional[str] = None
    use_in_pass_through: Optional[bool] = False
    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)

    def __init__(
        self,
        custom_llm_provider: Optional[str] = None,
        max_retries: Optional[Union[int, str]] = None,
        tpm: Optional[int] = None,
        rpm: Optional[int] = None,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        timeout: Optional[Union[float, str]] = None,
        stream_timeout: Optional[Union[float, str]] = None,
        organization: Optional[str] = None,
        litellm_trace_id: Optional[str] = None,
        # unified project/region
        region_name: Optional[str] = None,
        # Vertex AI
        vertex_project: Optional[str] = None,
        vertex_location: Optional[str] = None,
        vertex_credentials: Optional[str] = None,
        # AWS Bedrock / Sagemaker
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        aws_region_name: Optional[str] = None,
        # IBM watsonx
        watsonx_region_name: Optional[str] = None,
        input_cost_per_token: Optional[float] = None,
        output_cost_per_token: Optional[float] = None,
        input_cost_per_second: Optional[float] = None,
        output_cost_per_second: Optional[float] = None,
        max_file_size_mb: Optional[float] = None,
        # deployment budgets
        max_budget: Optional[float] = None,
        budget_duration: Optional[str] = None,
        use_in_pass_through: Optional[bool] = False,
        **params,
    ):
        args = locals()
        args.pop("max_retries", None)
        args.pop("self", None)
        args.pop("params", None)
        args.pop("__class__", None)
        if max_retries is not None and isinstance(max_retries, str):
            # coerce string max_retries (e.g. from a config file) to int
            max_retries = int(max_retries)
        super().__init__(max_retries=max_retries, **args, **params)

    def __contains__(self, key):
        # allow `"key" in params` checks
        return hasattr(self, key)

    def get(self, key, default=None):
        # dict-style .get() access
        return getattr(self, key, default)

    def __getitem__(self, key):
        # dict-style indexing
        return getattr(self, key)

    def __setitem__(self, key, value):
        # dict-style assignment
        setattr(self, key, value)
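

# Illustrative: `max_retries` may arrive as a string (e.g. read from a config
# file or env var) and is coerced to an int in __init__ before validation:
#
#   p = GenericLiteLLMParams(api_key="sk-placeholder", max_retries="3")
#   assert p.max_retries == 3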


class LiteLLM_Params(GenericLiteLLMParams):
    """
    LiteLLM Params with 'model' requirement - used for completions
    """

    model: str
    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)

    def __init__(
        self,
        model: str,
        custom_llm_provider: Optional[str] = None,
        max_retries: Optional[Union[int, str]] = None,
        tpm: Optional[int] = None,
        rpm: Optional[int] = None,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        api_version: Optional[str] = None,
        timeout: Optional[Union[float, str]] = None,
        stream_timeout: Optional[Union[float, str]] = None,
        organization: Optional[str] = None,
        # Vertex AI
        vertex_project: Optional[str] = None,
        vertex_location: Optional[str] = None,
        # AWS Bedrock / Sagemaker
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        aws_region_name: Optional[str] = None,
        max_file_size_mb: Optional[float] = None,
        use_in_pass_through: Optional[bool] = False,
        **params,
    ):
        args = locals()
        args.pop("max_retries", None)
        args.pop("self", None)
        args.pop("params", None)
        args.pop("__class__", None)
        if max_retries is not None and isinstance(max_retries, str):
            max_retries = int(max_retries)
        super().__init__(max_retries=max_retries, **args, **params)

    def __contains__(self, key):
        return hasattr(self, key)

    def get(self, key, default=None):
        return getattr(self, key, default)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        setattr(self, key, value)
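

# Illustrative: unlike its parent, LiteLLM_Params requires `model`; extra
# provider-specific kwargs are kept thanks to `extra="allow"` (all values
# below are placeholders):
#
#   lp = LiteLLM_Params(
#       model="azure/chatgpt-v-2",
#       api_base="https://example.openai.azure.com",
#   )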


class updateLiteLLMParams(GenericLiteLLMParams):
    # same params as GenericLiteLLMParams, but `model` is optional here so
    # partial updates to a deployment don't need to restate it
    model: Optional[str] = None


class updateDeployment(BaseModel):
    model_name: Optional[str] = None
    litellm_params: Optional[updateLiteLLMParams] = None
    model_info: Optional[ModelInfo] = None

    model_config = ConfigDict(protected_namespaces=())


class LiteLLMParamsTypedDict(TypedDict, total=False):
    model: str
    custom_llm_provider: Optional[str]
    tpm: Optional[int]
    rpm: Optional[int]
    order: Optional[int]
    weight: Optional[int]
    max_parallel_requests: Optional[int]
    api_key: Optional[str]
    api_base: Optional[str]
    api_version: Optional[str]
    timeout: Optional[Union[float, str, httpx.Timeout]]
    stream_timeout: Optional[Union[float, str]]
    max_retries: Optional[int]
    organization: Optional[Union[List, str]]
    configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS

    drop_params: Optional[bool]

    # unified project/region
    region_name: Optional[str]

    # Vertex AI
    vertex_project: Optional[str]
    vertex_location: Optional[str]

    # AWS Bedrock / Sagemaker
    aws_access_key_id: Optional[str]
    aws_secret_access_key: Optional[str]
    aws_region_name: Optional[str]

    # IBM watsonx
    watsonx_region_name: Optional[str]

    # custom pricing
    input_cost_per_token: Optional[float]
    output_cost_per_token: Optional[float]
    input_cost_per_second: Optional[float]
    output_cost_per_second: Optional[float]
    num_retries: Optional[int]

    # mock responses (for testing)
    mock_response: Optional[Union[str, ModelResponse, Exception]]

    # routing params
    tags: Optional[List[str]]

    # deployment budgets
    max_budget: Optional[float]
    budget_duration: Optional[str]


class DeploymentTypedDict(TypedDict, total=False):
    model_name: Required[str]
    litellm_params: Required[LiteLLMParamsTypedDict]
    model_info: dict
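

# Illustrative deployment dict matching the TypedDicts above (the model names
# are placeholders; "os.environ/..." is the litellm convention for reading a
# value from an environment variable):
#
#   deployment: DeploymentTypedDict = {
#       "model_name": "gpt-4-group",
#       "litellm_params": {
#           "model": "azure/gpt-4",
#           "api_key": "os.environ/AZURE_API_KEY",
#           "rpm": 100,
#       },
#   }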


# cost params set on litellm_params that get mirrored onto model_info
# (see Deployment.__init__ below)
SPECIAL_MODEL_INFO_PARAMS = [
    "input_cost_per_token",
    "output_cost_per_token",
    "input_cost_per_character",
    "output_cost_per_character",
]


class Deployment(BaseModel):
    model_name: str
    litellm_params: LiteLLM_Params
    model_info: ModelInfo

    model_config = ConfigDict(extra="allow", protected_namespaces=())

    def __init__(
        self,
        model_name: str,
        litellm_params: LiteLLM_Params,
        model_info: Optional[Union[ModelInfo, dict]] = None,
        **params,
    ):
        if model_info is None:
            model_info = ModelInfo()
        elif isinstance(model_info, dict):
            model_info = ModelInfo(**model_info)

        # mirror special cost params from litellm_params onto model_info
        for key in SPECIAL_MODEL_INFO_PARAMS:
            field = getattr(litellm_params, key, None)
            if field is not None:
                setattr(model_info, key, field)

        super().__init__(
            model_info=model_info,
            model_name=model_name,
            litellm_params=litellm_params,
            **params,
        )

    def to_json(self, **kwargs):
        try:
            return self.model_dump(**kwargs)  # pydantic v2
        except Exception:
            # fallback for pydantic v1
            return self.dict(**kwargs)

    def __contains__(self, key):
        # allow `"key" in deployment` checks
        return hasattr(self, key)

    def get(self, key, default=None):
        # dict-style .get() access
        return getattr(self, key, default)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        setattr(self, key, value)
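

# Illustrative: custom pricing set on litellm_params is copied onto model_info
# at construction time (the model name and price are placeholders):
#
#   d = Deployment(
#       model_name="gpt-4-group",
#       litellm_params=LiteLLM_Params(
#           model="azure/gpt-4", input_cost_per_token=0.00001
#       ),
#   )
#   assert d.model_info.input_cost_per_token == 0.00001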


class RouterErrors(enum.Enum):
    """
    Enum for router-specific errors with common codes
    """

    user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
    no_deployments_available = "No deployments available for selected model"
    no_deployments_with_tag_routing = (
        "Not allowed to access model due to tags configuration"
    )
    no_deployments_with_provider_budget_routing = (
        "No deployments available - crossed budget"
    )


class AllowedFailsPolicy(BaseModel):
    """
    Use this to set a custom number of allowed fails per minute before cooling down a deployment.
    If `AuthenticationErrorAllowedFails = 1000`, then 1000 AuthenticationErrors will be allowed before cooling down a deployment.

    Mapping of exception type to allowed_fails for each exception:
    https://docs.litellm.ai/docs/exception_mapping
    """

    BadRequestErrorAllowedFails: Optional[int] = None
    AuthenticationErrorAllowedFails: Optional[int] = None
    TimeoutErrorAllowedFails: Optional[int] = None
    RateLimitErrorAllowedFails: Optional[int] = None
    ContentPolicyViolationErrorAllowedFails: Optional[int] = None
    InternalServerErrorAllowedFails: Optional[int] = None


class RetryPolicy(BaseModel):
    """
    Use this to set a custom number of retries per exception type.
    If `RateLimitErrorRetries = 3`, then 3 retries will be made for RateLimitError.

    Mapping of exception type to number of retries:
    https://docs.litellm.ai/docs/exception_mapping
    """

    BadRequestErrorRetries: Optional[int] = None
    AuthenticationErrorRetries: Optional[int] = None
    TimeoutErrorRetries: Optional[int] = None
    RateLimitErrorRetries: Optional[int] = None
    ContentPolicyViolationErrorRetries: Optional[int] = None
    InternalServerErrorRetries: Optional[int] = None
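

# Illustrative policies (the counts are arbitrary): retry aggressively on rate
# limits, never retry bad requests, and cool a deployment down after a single
# authentication failure.
#
#   retry_policy = RetryPolicy(RateLimitErrorRetries=3, BadRequestErrorRetries=0)
#   allowed_fails_policy = AllowedFailsPolicy(AuthenticationErrorAllowedFails=1)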


class AlertingConfig(BaseModel):
    """
    Use this to configure alerting for the router. Receive alerts on the following events:
    - LLM API Exceptions
    - LLM Responses Too Slow
    - LLM Requests Hanging

    Args:
        webhook_url: str - webhook url for alerting; Slack provides a webhook url to send alerts to
        alerting_threshold: Optional[float] = 300 - threshold for slow / hanging llm responses (in seconds)
    """

    webhook_url: str
    alerting_threshold: Optional[float] = 300
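

# Illustrative: the webhook URL below is a placeholder Slack incoming webhook.
#
#   alerting = AlertingConfig(
#       webhook_url="https://hooks.slack.com/services/T000/B000/XXXX",
#       alerting_threshold=120,  # alert on responses slower than 2 minutes
#   )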


class ModelGroupInfo(BaseModel):
    model_group: str
    providers: List[str]
    max_input_tokens: Optional[float] = None
    max_output_tokens: Optional[float] = None
    input_cost_per_token: Optional[float] = None
    output_cost_per_token: Optional[float] = None
    mode: Optional[
        Union[
            str,
            Literal[
                "chat",
                "embedding",
                "completion",
                "image_generation",
                "audio_transcription",
                "rerank",
                "moderations",
            ],
        ]
    ] = Field(default="chat")
    tpm: Optional[int] = None
    rpm: Optional[int] = None
    supports_parallel_function_calling: bool = Field(default=False)
    supports_vision: bool = Field(default=False)
    supports_function_calling: bool = Field(default=False)
    supported_openai_params: Optional[List[str]] = Field(default=[])
    configurable_clientside_auth_params: CONFIGURABLE_CLIENTSIDE_AUTH_PARAMS = None

    def __init__(self, **data):
        # coerce `None` passed for any bool field to False, so callers can
        # pass through unset provider capabilities without a validation error
        for field_name, field_type in get_type_hints(self.__class__).items():
            if field_type == bool and data.get(field_name) is None:
                data[field_name] = False
        super().__init__(**data)
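

# Illustrative: None for a bool capability field is coerced to False in
# __init__ (the group and provider names are placeholders):
#
#   mgi = ModelGroupInfo(
#       model_group="gpt-4-group", providers=["azure"], supports_vision=None
#   )
#   assert mgi.supports_vision is False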


class AssistantsTypedDict(TypedDict):
    custom_llm_provider: Literal["azure", "openai"]
    litellm_params: LiteLLMParamsTypedDict


class FineTuningConfig(BaseModel):
    custom_llm_provider: Literal["azure", "openai"]


class CustomRoutingStrategyBase:
    async def async_get_available_deployment(
        self,
        model: str,
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        specific_deployment: Optional[bool] = False,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Asynchronously retrieves the available deployment based on the given parameters.

        Args:
            model (str): The name of the model.
            messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
            input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
            specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
            request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.

        Returns:
            An element from litellm.router.model_list
        """
        pass

    def get_available_deployment(
        self,
        model: str,
        messages: Optional[List[Dict[str, str]]] = None,
        input: Optional[Union[str, List]] = None,
        specific_deployment: Optional[bool] = False,
        request_kwargs: Optional[Dict] = None,
    ):
        """
        Synchronously retrieves the available deployment based on the given parameters.

        Args:
            model (str): The name of the model.
            messages (Optional[List[Dict[str, str]]], optional): The list of messages for a given request. Defaults to None.
            input (Optional[Union[str, List]], optional): The input for a given embedding request. Defaults to None.
            specific_deployment (Optional[bool], optional): Whether to retrieve a specific deployment. Defaults to False.
            request_kwargs (Optional[Dict], optional): Additional request keyword arguments. Defaults to None.

        Returns:
            An element from litellm.router.model_list
        """
        pass
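

# Sketch of a custom strategy (the deployment-picking logic is a placeholder;
# a real implementation would inspect the router's model list and return one
# of its deployments):
#
#   class FirstDeploymentStrategy(CustomRoutingStrategyBase):
#       async def async_get_available_deployment(
#           self, model, messages=None, input=None,
#           specific_deployment=False, request_kwargs=None,
#       ):
#           return self.deployments[model][0]  # hypothetical lookup table
#
# Registration is typically done via `router.set_custom_routing_strategy(...)`;
# check the Router docs for the exact hook.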


class RouterGeneralSettings(BaseModel):
    # initialize only async clients; useful for reducing memory usage
    async_only_mode: bool = Field(default=False)
    # if a request names a model not in the router's model_list, pass it
    # through to the underlying litellm completion/embedding call
    pass_through_all_models: bool = Field(default=False)


class RouterRateLimitErrorBasic(ValueError):
    """
    Raise a basic error inside helper functions.
    """

    def __init__(
        self,
        model: str,
    ):
        self.model = model
        _message = f"{RouterErrors.no_deployments_available.value}."
        super().__init__(_message)


class RouterRateLimitError(ValueError):
    def __init__(
        self,
        model: str,
        cooldown_time: float,
        enable_pre_call_checks: bool,
        cooldown_list: List,
    ):
        self.model = model
        self.cooldown_time = cooldown_time
        self.enable_pre_call_checks = enable_pre_call_checks
        self.cooldown_list = cooldown_list
        _message = f"{RouterErrors.no_deployments_available.value}, Try again in {cooldown_time} seconds. Passed model={model}. pre-call-checks={enable_pre_call_checks}, cooldown_list={cooldown_list}"
        super().__init__(_message)


class RouterModelGroupAliasItem(TypedDict):
    model: str
    hidden: bool  # if True, don't return this alias on `.get_model_list`


VALID_LITELLM_ENVIRONMENTS = [
    "development",
    "staging",
    "production",
]


class RoutingStrategy(enum.Enum):
    LEAST_BUSY = "least-busy"
    LATENCY_BASED = "latency-based-routing"
    COST_BASED = "cost-based-routing"
    USAGE_BASED_ROUTING_V2 = "usage-based-routing-v2"
    USAGE_BASED_ROUTING = "usage-based-routing"
    PROVIDER_BUDGET_LIMITING = "provider-budget-routing"


class RouterCacheEnum(enum.Enum):
    TPM = "global_router:{id}:{model}:tpm:{current_minute}"
    RPM = "global_router:{id}:{model}:rpm:{current_minute}"
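

# Illustrative: the cache-key templates above are filled in with str.format
# (the router id and minute below are placeholders):
#
#   key = RouterCacheEnum.TPM.value.format(
#       id="router-1", model="gpt-4", current_minute="2024-01-01-00-00"
#   )
#   # -> "global_router:router-1:gpt-4:tpm:2024-01-01-00-00"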


class GenericBudgetWindowDetails(BaseModel):
    """Details about a provider's budget window"""

    budget_start: float
    spend_key: str
    start_time_key: str
    ttl_seconds: int


OptionalPreCallChecks = List[Literal["prompt_caching", "router_budget_limiting"]]