# TestLLM / litellm / integrations / prometheus_services.py
# Raju2024's picture
# Upload 1072 files
# e3278e4 verified
# raw
# history blame
# 8.56 kB
# Used for monitoring the health of LiteLLM services via the `/metrics` endpoint on the LiteLLM Proxy.
#### What this does ####
# On success and failure, logs events to Prometheus for LiteLLM and adjacent services (litellm, redis, postgres, LLM API providers).
from typing import List, Optional, Union
from litellm._logging import print_verbose, verbose_logger
from litellm.types.integrations.prometheus import LATENCY_BUCKETS
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
# Extra label names attached (after the per-service label) to every
# `litellm_<service>_failed_requests` counter created in
# PrometheusServicesLogger.__init__. Label values must be supplied in this order.
FAILED_REQUESTS_LABELS = ["error_class", "function_name"]
class PrometheusServicesLogger:
    """
    Record per-service success / failure events as Prometheus metrics.

    For every value of ``ServiceTypes`` three collectors are created (or reused
    when a metric of the same name is already present in the registry):

    - ``litellm_<service>_latency``          Histogram
    - ``litellm_<service>_failed_requests``  Counter (extra labels: FAILED_REQUESTS_LABELS)
    - ``litellm_<service>_total_requests``   Counter

    NOTE: the first label of every collector is *named* after the service, and
    the hooks also pass the service name as that label's *value*.
    """

    # Class variables or attributes
    # Class-level attribute to store the Histogram (not assigned anywhere in this class)
    litellm_service_latency = None

    def __init__(
        self,
        mock_testing: bool = False,
        **kwargs,
    ):
        """
        Build the histogram / counters for every known service.

        Args:
            mock_testing: when True, the hooks also count their invocations in
                ``mock_testing_success_calls`` / ``mock_testing_failure_calls``.
            **kwargs: accepted but not used by this initializer.

        Raises:
            Exception: if ``prometheus_client`` is not installed, or if creating
                any collector fails (the error is logged, then re-raised).
        """
        try:
            try:
                from prometheus_client import REGISTRY, Counter, Histogram
            except ImportError as import_error:
                raise Exception(
                    "Missing prometheus_client. Run `pip install prometheus-client`"
                ) from import_error

            # Keep the classes on the instance: they are used both as factories
            # and for the isinstance() checks in the hooks.
            self.Histogram = Histogram
            self.Counter = Counter
            self.REGISTRY = REGISTRY

            verbose_logger.debug("in init prometheus services metrics")

            self.services = [item.value for item in ServiceTypes]

            # service name -> [latency histogram, failed counter, total counter]
            self.payload_to_prometheus_map = {}

            for service in self.services:
                histogram = self.create_histogram(service, type_of_request="latency")
                counter_failed_request = self.create_counter(
                    service,
                    type_of_request="failed_requests",
                    additional_labels=FAILED_REQUESTS_LABELS,
                )
                counter_total_requests = self.create_counter(
                    service, type_of_request="total_requests"
                )
                self.payload_to_prometheus_map[service] = [
                    histogram,
                    counter_failed_request,
                    counter_total_requests,
                ]

            # the field / value in ServiceLoggerPayload the object needs to be incremented by
            self.prometheus_to_amount_map: dict = {}

            ### MOCK TESTING ###
            self.mock_testing = mock_testing
            self.mock_testing_success_calls = 0
            self.mock_testing_failure_calls = 0
        except Exception as e:
            print_verbose(f"Got exception on init prometheus client {str(e)}")
            # bare `raise` re-raises the same exception with its original traceback
            raise

    def is_metric_registered(self, metric_name) -> bool:
        """Return True if the registry's collect() yields a metric named `metric_name`."""
        return any(metric_name == metric.name for metric in self.REGISTRY.collect())

    def _get_metric(self, metric_name):
        """
        Helper function to get a metric from the registry by name.

        Returns None when the name is unknown to the registry.
        """
        return self.REGISTRY._names_to_collectors.get(metric_name)

    def create_histogram(self, service: str, type_of_request: str):
        """
        Return the Histogram named ``litellm_<service>_<type_of_request>``.

        An already-registered metric of that name is reused; otherwise a new
        Histogram is created with LATENCY_BUCKETS and a single label named
        after the service.
        """
        metric_name = "litellm_{}_{}".format(service, type_of_request)
        if self.is_metric_registered(metric_name):
            return self._get_metric(metric_name)
        return self.Histogram(
            metric_name,
            "Latency for {} service".format(service),
            labelnames=[service],
            buckets=LATENCY_BUCKETS,
        )

    def create_counter(
        self,
        service: str,
        type_of_request: str,
        additional_labels: Optional[List[str]] = None,
    ):
        """
        Return the Counter named ``litellm_<service>_<type_of_request>``.

        An already-registered metric of that name is reused; otherwise a new
        Counter is created whose label names are the service name followed by
        `additional_labels` (if any).
        """
        metric_name = "litellm_{}_{}".format(service, type_of_request)
        if self.is_metric_registered(metric_name):
            return self._get_metric(metric_name)
        return self.Counter(
            metric_name,
            "Total {} for {} service".format(type_of_request, service),
            labelnames=[service] + (additional_labels or []),
        )

    def observe_histogram(
        self,
        histogram,
        labels: str,
        amount: float,
    ):
        """Observe `amount` on `histogram` under the single label value `labels`."""
        assert isinstance(histogram, self.Histogram)

        histogram.labels(labels).observe(amount)

    def increment_counter(
        self,
        counter,
        labels: str,
        amount: float,
        additional_labels: Optional[List[str]] = None,
    ):
        """
        Increment `counter` by `amount`.

        Args:
            counter: a ``self.Counter`` instance.
            labels: value for the counter's first (service) label.
            amount: increment size.
            additional_labels: values for any further labels, passed
                positionally after `labels`. None or empty means no extras.
        """
        assert isinstance(counter, self.Counter)

        counter.labels(labels, *(additional_labels or [])).inc(amount)

    def _record_success(self, payload: "ServiceLoggerPayload"):
        """Observe latency and bump total_requests for the payload's service."""
        if self.mock_testing:
            self.mock_testing_success_calls += 1

        service_name = payload.service.value
        for obj in self.payload_to_prometheus_map.get(service_name, []):
            if isinstance(obj, self.Histogram):
                self.observe_histogram(
                    histogram=obj,
                    labels=service_name,
                    amount=payload.duration,
                )
            elif isinstance(obj, self.Counter) and "total_requests" in obj._name:
                self.increment_counter(
                    counter=obj,
                    labels=service_name,
                    amount=1,  # LOG TOTAL REQUESTS TO PROMETHEUS
                )

    def _record_failure(self, payload: "ServiceLoggerPayload", error_class: str):
        """Bump both failed_requests and total_requests for the payload's service."""
        if self.mock_testing:
            self.mock_testing_failure_calls += 1

        function_name = payload.call_type
        service_name = payload.service.value
        for obj in self.payload_to_prometheus_map.get(service_name, []):
            if not isinstance(obj, self.Counter):
                continue
            if "failed_requests" in obj._name:
                # This counter was created with FAILED_REQUESTS_LABELS, so it
                # needs a value for each of them or labels() raises ValueError.
                self.increment_counter(
                    counter=obj,
                    labels=service_name,
                    # log additional_labels=["error_class", "function_name"], used for debugging what's going wrong with the DB
                    additional_labels=[error_class, function_name],
                    amount=1,  # LOG ERROR COUNT TO PROMETHEUS
                )
            else:
                self.increment_counter(
                    counter=obj,
                    labels=service_name,
                    amount=1,  # LOG TOTAL REQUESTS TO PROMETHEUS
                )

    def service_success_hook(self, payload: "ServiceLoggerPayload"):
        """Log a successful call to prometheus (sync variant)."""
        self._record_success(payload)

    def service_failure_hook(
        self,
        payload: "ServiceLoggerPayload",
        error: Optional[Union[str, Exception]] = None,
    ):
        """
        Log a failed call to prometheus (sync variant).

        Args:
            payload: describes the service and call that failed.
            error: optional error; its class name becomes the ``error_class``
                label. When omitted, ``error_class`` is recorded as "unknown".

        Previously this hook passed only the service label to the
        failed_requests counter, which has three label names, so the
        increment failed with a label-count ValueError.
        """
        error_class = "unknown" if error is None else error.__class__.__name__
        self._record_failure(payload, error_class)

    async def async_service_success_hook(self, payload: "ServiceLoggerPayload"):
        """
        Log successful call to prometheus
        """
        self._record_success(payload)

    async def async_service_failure_hook(
        self,
        payload: "ServiceLoggerPayload",
        error: Union[str, Exception],
    ):
        """
        Log a failed call to prometheus (async variant).

        The class name of `error` is recorded as the ``error_class`` label and
        ``payload.call_type`` as the ``function_name`` label.
        """
        self._record_failure(payload, error.__class__.__name__)