yetessam committed on
Commit
36334a9
·
verified ·
1 Parent(s): 5e8d288

Create status_check.py

Browse files
Files changed (1) hide show
  1. status_check.py +118 -0
status_check.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import requests
4
+
5
+ from typing import Optional, Tuple
6
+
7
+
8
+
9
+ from ui.statusui import StatusUI
10
+ from checks.health_check import check_model_endpoint
11
+ from agents.model import load_huggingface_model
12
+
13
+
14
def wake_up_endpoint(
    endpoint_uri: str,
    ui,
    max_wait: int = 300,
    initial_delay: float = 3.0,
    backoff_factor: float = 1.5,
    max_retry_delay: float = 10.0
) -> Tuple[bool, Optional[str]]:
    """
    Poll the endpoint until it responds OK or the wait budget is exhausted.

    Each attempt POSTs a small JSON payload to the endpoint. If the HF_TOKEN
    environment variable is set, it is sent as a Bearer token. Between
    attempts the function sleeps, growing the delay by ``backoff_factor``
    up to ``max_retry_delay``.

    Args:
        endpoint_uri: The endpoint URL to monitor
        ui: UI object for status updates (only its ``append`` method is used)
        max_wait: Maximum total wait time in seconds (minimum 60s enforced)
        initial_delay: Initial delay between retries in seconds
        backoff_factor: Multiplier for exponential backoff
        max_retry_delay: Maximum delay between retries in seconds

    Returns:
        Tuple of (success: bool, error_message: Optional[str]).
        ``(True, None)`` as soon as a response with an OK status arrives,
        ``(False, <message>)`` when the wait budget runs out.
    """
    # Smallest request timeout we will ever use; non-positive timeouts are
    # rejected by the HTTP stack with a ValueError that the retry loop would
    # not catch.
    min_timeout = 0.5

    # Configuration validation
    max_wait = max(max_wait, 60)
    # Clamp at zero: time.sleep() raises ValueError on a negative delay.
    current_delay = max(0.0, min(initial_delay, max_retry_delay))

    # Prepare request components
    headers = {}
    if hf_token := os.environ.get("HF_TOKEN"):
        headers["Authorization"] = f"Bearer {hf_token}"

    payload = {"inputs": "ping"}

    # Use the monotonic clock so wall-clock adjustments cannot shorten or
    # stretch the wait budget.
    deadline = time.monotonic() + max_wait
    announced = False

    while time.monotonic() < deadline:
        # Keep the request timeout below the retry delay, but strictly positive.
        timeout = max(min_timeout, min(5, current_delay * 0.8))

        # Announce the ping once rather than on every attempt
        if not announced:
            ui.append(f"Pinging endpoint: {endpoint_uri}")
            announced = True

        try:
            response = requests.post(
                endpoint_uri,
                headers=headers,
                json=payload,
                timeout=timeout
            )
        except requests.exceptions.RequestException as e:
            status_msg = f"Connection error ({type(e).__name__})"
        else:
            if response.ok:
                ui.append("✅ Endpoint is awake and responsive")
                return True, None

            # Handle specific HTTP status codes
            if response.status_code in {503, 504}:
                status_msg = f"Endpoint warming up (HTTP {response.status_code})"
            else:
                status_msg = f"Unexpected response (HTTP {response.status_code})"

        # Never sleep past the deadline, and do not promise a retry that
        # cannot happen.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        sleep_for = min(current_delay, remaining)
        ui.append(f"{status_msg}, retrying in {sleep_for:.1f}s...")

        # Wait before next attempt with exponential backoff
        time.sleep(sleep_for)
        current_delay = max(0.0, min(current_delay * backoff_factor, max_retry_delay))

    # Timeout reached
    error_msg = f"❌ Timed out after {max_wait}s waiting for endpoint"
    ui.append(error_msg)
    return False, error_msg
90
+
91
def run_status_checks():
    """Run all status checks and return endpoint URI if successful.

    Launches the status UI, resolves the endpoint URI, tries to wake the
    endpoint, and then runs the health check against it.

    Returns:
        The endpoint URI when the health check passes. When it fails, the
        failure UI is launched and None is returned.
    """
    ui = StatusUI("Content Agent Status Checks")
    ui.launch()

    ui.append("Starting prechecks...")
    ui.append("Checking endpoint..")

    endpoint_uri = load_huggingface_model()  # Get the URI for the endpoint
    ui.append(endpoint_uri)

    # Wake it up before health check.
    # wake_up_endpoint returns a (success, error_message) tuple; it must be
    # unpacked, because a non-empty tuple is always truthy and would make the
    # failure branch below unreachable.
    wake_up_successful, _wake_error = wake_up_endpoint(endpoint_uri, ui)

    if not wake_up_successful:
        # Best effort: the health check below makes the final decision.
        ui.append("Warning: Could not wake up the endpoint. Continuing.")
    else:
        ui.append("✅ End point responded OK.")

    is_healthy, status_info = check_model_endpoint(endpoint_uri)  # Test the endpoint

    if not is_healthy:
        # Imported here rather than at module level, as in the original.
        from checks.failed_check import create_failed_gradio_ui
        interface = create_failed_gradio_ui(status_info)
        interface.launch(show_error=True, share=True)
        return None

    return endpoint_uri