update evaluation progress
backend/routes/evaluation.py
CHANGED
@@ -92,10 +92,15 @@ async def get_evaluation_logs(session_id: str):
     if is_completed and hasattr(evaluation_task, 'results') and evaluation_task.results:
         results = evaluation_task.results
 
+    # Retrieve the step information
+    progress = evaluation_task.get_progress()
+
     return {
         "logs": logs,
         "is_completed": is_completed,
-        "results": results
+        "results": results,
+        "current_step": progress["current_step"],
+        "completed_steps": progress["completed_steps"]
     }
 
 @router.get("/evaluation-results/{session_id}")
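
The two new fields let the frontend display real backend progress instead of a purely timed animation. As a rough sketch of how a client consumes the enriched response (the route path and base URL here are assumptions; only the handler name get_evaluation_logs is visible in this diff):

    import time
    import requests

    def poll_evaluation_progress(session_id: str, base_url: str = "http://localhost:8000"):
        """Poll the logs endpoint until the evaluation task reports completion."""
        while True:
            # Route path is an assumption inferred from the handler name above
            data = requests.get(f"{base_url}/evaluation-logs/{session_id}").json()
            print(f"step={data.get('current_step')} done={data.get('completed_steps')}")
            if data.get("is_completed"):
                return data.get("results")
            time.sleep(1)
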
backend/tasks/evaluation_task.py
CHANGED
@@ -42,21 +42,35 @@ class EvaluationTask:
         self.timeout = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
         self.current_step = "initializing"
         self.completed_steps = []
+        self.step_start_time = time.time()  # Record the start time of the current step
 
         # Clean up old results if requested
         if clean_old_results:
             self.clean_old_results()
 
-    def update_step(self, step: str) -> None:
+    async def update_step(self, step: str) -> None:
         """
-        Update the current step and completed steps
+        Update the current step and completed steps with a minimum delay of 1 second
 
         Args:
             step: Name of the step to update
         """
+        # Compute the time elapsed since the previous step started
+        elapsed_since_step_start = time.time() - self.step_start_time
+
+        # If less than one second has elapsed, wait out the rest of the second
+        if elapsed_since_step_start < 1.0:
+            await asyncio.sleep(1.0 - elapsed_since_step_start)
+
+        # Update the current step and record the new timestamp
         self.current_step = step
+        self.step_start_time = time.time()
+
+        # Add to the completed steps if not already present
         if step not in self.completed_steps:
             self.completed_steps.append(step)
+
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Step changed to: {step}")
 
     def get_progress(self) -> Dict:
         """

@@ -270,7 +284,7 @@ TASKS_TABLE = [yourbench]
         ]
 
         # Step 1: Check available providers for each model
-        self.update_step("finding_available_model_providers")
+        await self.update_step("finding_available_model_providers")
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
 
         model_providers = {}

@@ -288,11 +302,11 @@ TASKS_TABLE = [yourbench]
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Found providers for {len(model_providers)} models")
 
         # Step 2: Run evaluations in parallel
-        self.update_step("starting_evaluation_process")
+        await self.update_step("starting_evaluation_process")
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation process...")
 
         # Step 3: Evaluate models
-        self.update_step("evaluating_models")
+        await self.update_step("evaluating_models")
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluating models...")
 
         tasks = []

@@ -306,13 +320,13 @@ TASKS_TABLE = [yourbench]
         self.results = [r for r in results if r["status"] == "success"]
 
         # Step 4: Save results
-        self.update_step("storing_evaluation_results")
+        await self.update_step("storing_evaluation_results")
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Storing evaluation results...")
         self._save_results_to_hub()
 
         # Mark task as completed
         self.is_completed = True
-        self.update_step("completed")
+        await self.update_step("completed")
 
         total_time = time.time() - script_start_time
         print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation completed in {total_time:.2f}s")
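
update_step becomes a coroutine so it can enforce a minimum display time per step: without the asyncio.sleep, steps that finish in milliseconds would never be visible to a frontend polling once per second. A self-contained sketch of the same pacing pattern (StepTracker is a hypothetical stand-in, not the project's class):

    import asyncio
    import time

    class StepTracker:
        """Minimal sketch of the minimum-duration step pattern used above."""

        def __init__(self):
            self.current_step = "initializing"
            self.step_start_time = time.time()

        async def update_step(self, step: str, min_duration: float = 1.0) -> None:
            # Wait out the remainder of the minimum duration, if any
            elapsed = time.time() - self.step_start_time
            if elapsed < min_duration:
                await asyncio.sleep(min_duration - elapsed)
            self.current_step = step
            self.step_start_time = time.time()

    async def main():
        tracker = StepTracker()
        start = time.time()
        for step in ("a", "b", "c"):  # three instantaneous steps...
            await tracker.update_step(step)
        print(f"elapsed: {time.time() - start:.1f}s")  # ...still take ~3s total

    asyncio.run(main())
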
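The route in backend/routes/evaluation.py reads exactly two keys from get_progress(). Its body is not shown in this diff; given the attributes maintained above, it presumably resembles this hypothetical sketch:

    def get_progress(self) -> Dict:
        # Hypothetical body: the diff shows only the method signature
        # and the two keys the route reads from its return value.
        return {
            "current_step": self.current_step,
            "completed_steps": self.completed_steps,
        }
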
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
@@ -49,19 +49,18 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
 
   // Add effect to handle starting messages
   useEffect(() => {
-    startingMessageIntervalRef.current = setInterval(
-      () => {
+    // Only set up the automatic interval for default documents
+    // For real evaluations, rely solely on the API updates
+    if (isDefault) {
+      startingMessageIntervalRef.current = setInterval(() => {
         setStartingMessageIndex((prev) => {
           if (prev < STARTING_MESSAGES.length - 1) {
             return prev + 1;
           }
           return prev;
         });
-      },
-      isDefault
-        ? MESSAGE_CHANGE_INTERVAL.PRECALCULATED
-        : MESSAGE_CHANGE_INTERVAL.DEFAULT
-    );
+      }, MESSAGE_CHANGE_INTERVAL.PRECALCULATED);
+    }
 
     return () => {
       if (startingMessageIntervalRef.current) {

@@ -116,16 +115,24 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
           clearInterval(startingMessageIntervalRef.current);
         }
       } else {
-        // If the evaluation is still running, update the message
-        // index based on the elapsed time
-        const progress = Math.min(
-          Math.floor(
-            (Date.now() - startTimeRef.current) /
-              MESSAGE_CHANGE_INTERVAL.DEFAULT
-          ),
-          STARTING_MESSAGES.length - 1
-        );
-        setStartingMessageIndex(progress);
+        // If the evaluation is still running, use the current step from the backend
+        if (logsResult.current_step) {
+          // Use the mapping function to determine the message index
+          const newIndex = mapStepToMessageIndex(
+            logsResult.current_step
+          );
+          setStartingMessageIndex(newIndex);
+        } else {
+          // Time-based fallback if the step is not available
+          const progress = Math.min(
+            Math.floor(
+              (Date.now() - startTimeRef.current) /
+                MESSAGE_CHANGE_INTERVAL.DEFAULT
+            ),
+            STARTING_MESSAGES.length - 1
+          );
+          setStartingMessageIndex(progress);
+        }
       }
     }
   } catch (error) {

@@ -191,6 +198,35 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
     ].join(":");
   };
 
+  // Map the backend step name to its index in STARTING_MESSAGES
+  const mapStepToMessageIndex = (currentStep) => {
+    switch (currentStep) {
+      case "initializing":
+        return 0;
+      case "finding_available_model_providers":
+        return 1;
+      case "starting_evaluation_process":
+        return 2;
+      case "evaluating_models":
+        return 3;
+      case "storing_evaluation_results":
+      case "completed":
+        return 4;
+      default:
+        // Estimate the step from the elapsed time if it is not recognized
+        const elapsedSinceStart = Date.now() - startTimeRef.current;
+        const estimatedTotalTime = 80000; // 80 seconds
+        const estimatedProgress = Math.min(
+          elapsedSinceStart / estimatedTotalTime,
+          1
+        );
+        return Math.min(
+          Math.floor(estimatedProgress * STARTING_MESSAGES.length),
+          STARTING_MESSAGES.length - 1
+        );
+    }
+  };
+
   // Start benchmark evaluation
   const startEvaluation = async () => {
     if (!sessionId) {

@@ -242,25 +278,25 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
           clearInterval(startingMessageIntervalRef.current);
         }
       } else {
-        // Estimate the current step from the elapsed time
-        const elapsedSinceStart = Date.now() - startTimeRef.current;
-        const estimatedTotalTime = 80000; // 80 seconds
-        const estimatedProgress = Math.min(
-          elapsedSinceStart / estimatedTotalTime,
-          1
-        );
-        const estimatedStepIndex = Math.min(
-          Math.floor(estimatedProgress * STARTING_MESSAGES.length),
-          STARTING_MESSAGES.length - 1
-        );
+        // Retrieve the current step from the API, if available
+        if (logsResult.current_step) {
+          // Use the mapping function to determine the message index
+          const newIndex = mapStepToMessageIndex(
+            logsResult.current_step
+          );
+          setStartingMessageIndex(newIndex);
+        } else {
+          // Fallback: if the API does not return a step, estimate from the elapsed time
+          const elapsedSinceStart = Date.now() - startTimeRef.current;
+          const estimatedTotalTime = 80000; // 80 seconds
+          const estimatedProgress = Math.min(
+            elapsedSinceStart / estimatedTotalTime,
+            1
+          );
+          const estimatedStepIndex = Math.min(
+            Math.floor(estimatedProgress * STARTING_MESSAGES.length),
+            STARTING_MESSAGES.length - 1
+          );
           setStartingMessageIndex(estimatedStepIndex);
+        }
       }
     }
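
Both fallback branches map elapsed time linearly onto message indices: with the assumed 80-second total and five STARTING_MESSAGES (the list itself is not shown, but the switch above tops out at index 4), the index advances every 16 seconds and saturates at the last message. A quick check of that arithmetic:

    # Assumes five starting messages; STARTING_MESSAGES itself is not in this diff.
    NUM_MESSAGES = 5
    ESTIMATED_TOTAL_MS = 80_000

    def estimated_index(elapsed_ms: int) -> int:
        progress = min(elapsed_ms / ESTIMATED_TOTAL_MS, 1)
        return min(int(progress * NUM_MESSAGES), NUM_MESSAGES - 1)

    for t in (0, 15_999, 16_000, 48_000, 80_000, 120_000):
        print(t, estimated_index(t))  # 0, 0, 1, 3, 4, 4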