tfrere committed
Commit 4fb52f5 · 1 Parent(s): 0e34dc4

add model provider switching to eval

backend/benchmark_results.json DELETED
@@ -1,139 +0,0 @@
-{
-  "timestamp": "2025-04-01T10:30:15.307581",
-  "models": {
-    "Qwen/Qwen2.5-72B-Instruct": [
-      {
-        "provider": "sambanova",
-        "total_time": 21.616381883621216,
-        "success_rate": 1.0,
-        "average_time": 4.323276376724243
-      },
-      {
-        "provider": "together",
-        "total_time": 21.84441828727722,
-        "success_rate": 1.0,
-        "average_time": 4.368883657455444
-      },
-      {
-        "provider": "nebius",
-        "total_time": 22.003292322158813,
-        "success_rate": 1.0,
-        "average_time": 4.400658464431762
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 22.086440563201904,
-        "success_rate": 1.0,
-        "average_time": 4.417288112640381
-      },
-      {
-        "provider": "novita",
-        "total_time": 22.16641402244568,
-        "success_rate": 1.0,
-        "average_time": 4.433282804489136
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 22.41838788986206,
-        "success_rate": 1.0,
-        "average_time": 4.483677577972412
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 23.555410146713257,
-        "success_rate": 1.0,
-        "average_time": 4.711082029342651
-      }
-    ],
-    "meta-llama/Llama-3.3-70B-Instruct": [
-      {
-        "provider": "novita",
-        "total_time": 28.36034393310547,
-        "success_rate": 1.0,
-        "average_time": 5.672068786621094
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 31.595482110977173,
-        "success_rate": 1.0,
-        "average_time": 6.319096422195434
-      },
-      {
-        "provider": "sambanova",
-        "total_time": 31.845455646514893,
-        "success_rate": 1.0,
-        "average_time": 6.369091129302978
-      },
-      {
-        "provider": "nebius",
-        "total_time": 31.963874578475952,
-        "success_rate": 1.0,
-        "average_time": 6.39277491569519
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 35.02063775062561,
-        "success_rate": 1.0,
-        "average_time": 7.004127550125122
-      },
-      {
-        "provider": "together",
-        "total_time": 36.88544726371765,
-        "success_rate": 1.0,
-        "average_time": 7.3770894527435305
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 37.26896572113037,
-        "success_rate": 1.0,
-        "average_time": 7.453793144226074
-      },
-      {
-        "provider": "cerebras",
-        "total_time": 37.70701003074646,
-        "success_rate": 1.0,
-        "average_time": 7.541402006149292
-      }
-    ],
-    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": null,
-    "Qwen/QwQ-32B": [
-      {
-        "provider": "sambanova",
-        "total_time": 25.050092935562134,
-        "success_rate": 1.0,
-        "average_time": 5.010018587112427
-      },
-      {
-        "provider": "novita",
-        "total_time": 25.061633110046387,
-        "success_rate": 1.0,
-        "average_time": 5.012326622009278
-      },
-      {
-        "provider": "hyperbolic",
-        "total_time": 25.363604307174683,
-        "success_rate": 1.0,
-        "average_time": 5.072720861434936
-      },
-      {
-        "provider": "nebius",
-        "total_time": 25.37495517730713,
-        "success_rate": 1.0,
-        "average_time": 5.074991035461426
-      },
-      {
-        "provider": "hf-inference",
-        "total_time": 25.41055965423584,
-        "success_rate": 1.0,
-        "average_time": 5.082111930847168
-      },
-      {
-        "provider": "fireworks-ai",
-        "total_time": 25.595581769943237,
-        "success_rate": 1.0,
-        "average_time": 5.119116353988647
-      }
-    ],
-    "mistralai/Mistral-Small-24B-Instruct-2501": null
-  }
-}
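
Results with this shape are easy to post-process. A minimal sketch, assuming a local copy of a file with the structure shown above, that picks the fastest provider per model (models with no usable benchmark are stored as null):

    import json

    with open("benchmark_results.json") as f:
        data = json.load(f)

    for model, runs in data["models"].items():
        if not runs:  # models with no usable benchmark are stored as null
            print(f"{model}: no provider results")
            continue
        best = min(runs, key=lambda r: r["average_time"])
        print(f"{model}: fastest provider is {best['provider']} "
              f"({best['average_time']:.2f}s avg, {best['success_rate']:.0%} success)")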
backend/results.json DELETED
The diff for this file is too large to render. See raw diff
 
backend/routes/cleanup.py CHANGED
@@ -2,28 +2,38 @@ from fastapi import APIRouter, HTTPException
 import os
 import shutil
 from .upload import session_files
+import logging
 
 router = APIRouter(tags=["cleanup"])
 
-# Dossier racine pour les uploads
+# Root directory for uploads
 UPLOAD_ROOT = "uploaded_files"
 
-# Liste des documents de base qui ne doivent pas être supprimés
+# List of base documents that should not be deleted
 BASE_DOCUMENTS = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]
 
 @router.delete("/cleanup-session/{session_id}")
 async def cleanup_session(session_id: str):
     """
-    Supprime le dossier de session après que l'utilisateur a visualisé les résultats d'évaluation.
-    Ne supprime pas les documents de base.
+    Removes the session directory after the user has viewed the evaluation results.
+    Does not remove base documents.
+    In development mode, does nothing and returns a log message.
 
     Args:
-        session_id: ID de la session à supprimer
+        session_id: ID of the session to delete
 
     Returns:
-        Dictionary avec statut et message
+        Dictionary with status and message
     """
-    # Vérifier si le session_id existe et n'est pas un document de base
+    # Check if we are in development mode
+    if os.environ.get("ENVIRONEMENT", "").lower() == "development":
+        logging.info(f"[DEV MODE] Cleanup called for session: {session_id} - No action taken in development mode")
+        return {
+            "success": True,
+            "message": f"Development mode - cleanup skipped for session: {session_id}"
+        }
+
+    # Check if the session_id exists and is not a base document
     if session_id in BASE_DOCUMENTS:
         return {
             "success": False,
@@ -32,7 +42,7 @@ async def cleanup_session(session_id: str):
 
     session_dir = os.path.join(UPLOAD_ROOT, session_id)
 
-    # Vérifier si le dossier existe
+    # Check if the directory exists
     if not os.path.exists(session_dir):
         return {
             "success": False,
@@ -40,11 +50,11 @@ async def cleanup_session(session_id: str):
     }
 
     try:
-        # Supprimer la référence du fichier de session
+        # Remove the session file reference
        if session_id in session_files:
            del session_files[session_id]
 
-        # Supprimer le dossier de session
+        # Remove the session directory
        shutil.rmtree(session_dir)
 
        return {
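
The new guard reads the ENVIRONEMENT variable (spelled exactly as in the code), so the server process must set that exact name for the skip to trigger. A minimal client-side sketch, assuming a hypothetical local base URL and session id, of how the route behaves:

    import requests

    # Sketch only: the base URL and session id below are hypothetical.
    BASE_URL = "http://localhost:8000"
    SESSION_ID = "my-session-id"

    # If the *server* process was started with ENVIRONEMENT=development (note the
    # spelling used in cleanup.py), deletion is skipped and a success message is
    # returned; otherwise uploaded_files/<session_id> is removed, and base
    # documents are always refused.
    resp = requests.delete(f"{BASE_URL}/cleanup-session/{SESSION_ID}")
    print(resp.status_code, resp.json())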
backend/tasks/evaluation_task.py CHANGED
@@ -13,11 +13,12 @@ import json
 import shutil
 from typing import List, Dict
 from tasks.get_model_providers import get_model_providers
+from tasks.get_available_model_provider import get_available_model_provider
 from huggingface_hub import HfApi
 import asyncio
 
 # Valeur par défaut du timeout
-DEFAULT_EVALUATION_TIMEOUT = 60.0 # 1 minute par défaut
+DEFAULT_EVALUATION_TIMEOUT = 70.0 # 1 minute par défaut
 
 class EvaluationTask:
     """
@@ -40,11 +41,36 @@ class EvaluationTask:
         self.results = []
         self.hf_api = HfApi()
         self.timeout = timeout if timeout is not None else DEFAULT_EVALUATION_TIMEOUT
+        self.current_step = "initializing"
+        self.completed_steps = []
 
         # Nettoyer les anciens résultats si demandé
         if clean_old_results:
             self.clean_old_results()
 
+    def update_step(self, step: str) -> None:
+        """
+        Update the current step and completed steps
+
+        Args:
+            step: Name of the step to update
+        """
+        self.current_step = step
+        if step not in self.completed_steps:
+            self.completed_steps.append(step)
+
+    def get_progress(self) -> Dict:
+        """
+        Get the current progress of the task
+
+        Returns:
+            Dictionary containing current step and completed steps
+        """
+        return {
+            "current_step": self.current_step,
+            "completed_steps": self.completed_steps
+        }
+
     def clean_old_results(self) -> None:
         """
         Clean old evaluation results to avoid confusion
@@ -239,57 +265,73 @@ TASKS_TABLE = [yourbench]
         models = [
             "Qwen/QwQ-32B",
             "Qwen/Qwen2.5-72B-Instruct",
-            "deepseek-ai/DeepSeek-V3-0324",
+            "meta-llama/Llama-3.3-70B-Instruct",
             "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+            "mistralai/Mistral-Small-24B-Instruct-2501",
         ]
 
-        # Get providers for each model
-        model_providers = get_model_providers(models)
+        # Step 1: Check available providers for each model
+        self.update_step("finding_available_model_providers")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Checking available providers for models...")
+
+        model_providers = {}
+        for model in models:
+            provider = get_available_model_provider(model, verbose=True)
+            if provider:
+                model_providers[model] = provider
+            else:
+                print(f"[{datetime.now().strftime('%H:%M:%S')}] No available provider found for {model}")
 
-        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
+        if not model_providers:
+            print(f"[{datetime.now().strftime('%H:%M:%S')}] No models with available providers found")
+            return
+
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Found providers for {len(model_providers)} models")
+
+        # Step 2: Run evaluations in parallel
+        self.update_step("starting_evaluation_process")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation process...")
+
+        # Step 3: Evaluate models
+        self.update_step("evaluating_models")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluating models...")
 
-        # Run evaluations in parallel using asyncio
         tasks = []
-        for model_name, providers in model_providers:
-            if providers:  # Only run if providers are available
-                tasks.append(self._run_lighteval(model_name, providers[0]))
+        for model, provider in model_providers.items():
+            tasks.append(self._run_lighteval(model, provider))
 
-        self.results = await asyncio.gather(*tasks)
-
-        # Calculate total script execution time
-        total_time = time.time() - script_start_time
-        print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
+        # Run all evaluations in parallel
+        results = await asyncio.gather(*tasks)
 
-        # Cleanup intermediate results if they exist
-        if os.path.exists("data/lighteval_results"):
-            print(f"[{datetime.now().strftime('%H:%M:%S')}] Cleaning up intermediate results")
-            try:
-                # Recursively delete intermediate results
-                import shutil
-                shutil.rmtree("data/lighteval_results", ignore_errors=True)
-            except Exception as e:
-                print(f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean up intermediate results: {str(e)}")
+        # Filter out failed evaluations
+        self.results = [r for r in results if r["status"] == "success"]
 
-        # Save final results to Hub (only once)
+        # Step 4: Save results
+        self.update_step("storing_evaluation_results")
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Storing evaluation results...")
        self._save_results_to_hub()
 
-        # Mark the task as completed
+        # Mark task as completed
        self.is_completed = True
+        self.update_step("completed")
+
+        total_time = time.time() - script_start_time
+        print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation completed in {total_time:.2f}s")
 
     def get_logs(self) -> List[str]:
         """
-        Get logs for this task (empty list since we don't track logs anymore)
+        Get the logs of the task
 
         Returns:
-            Empty list of logs
+            List of log messages
         """
-        return []
+        return self.logs if hasattr(self, "logs") else []
 
     def is_task_completed(self) -> bool:
         """
         Check if the task is completed
 
         Returns:
-            True if completed, False otherwise
+            True if the task is completed, False otherwise
         """
         return self.is_completed
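
The heart of the change is that each model is now paired with the first provider reported as available before any lighteval run starts, and failed runs are filtered out after asyncio.gather. A self-contained sketch of that pattern; the two helpers below are stand-ins (their real counterparts are tasks.get_available_model_provider and EvaluationTask._run_lighteval, whose bodies are not shown in this diff):

    import asyncio
    from typing import Dict, List, Optional

    def get_available_model_provider(model: str, verbose: bool = False) -> Optional[str]:
        # Stand-in: pretend every model is served by "novita". The real helper probes
        # inference providers and returns the first one that responds for this model.
        if verbose:
            print(f"Checking providers for {model}...")
        return "novita"

    async def run_lighteval(model: str, provider: str) -> Dict:
        # Stand-in for EvaluationTask._run_lighteval, which launches the real benchmark.
        await asyncio.sleep(0)
        return {"model": model, "provider": provider, "status": "success"}

    async def evaluate(models: List[str]) -> List[Dict]:
        # Pair each model with the first provider that is currently available,
        # skipping models that nobody serves right now.
        model_providers = {}
        for model in models:
            provider = get_available_model_provider(model, verbose=True)
            if provider:
                model_providers[model] = provider

        # Run all evaluations concurrently and keep only the successful ones.
        results = await asyncio.gather(
            *(run_lighteval(m, p) for m, p in model_providers.items())
        )
        return [r for r in results if r["status"] == "success"]

    if __name__ == "__main__":
        print(asyncio.run(evaluate(["Qwen/QwQ-32B", "Qwen/Qwen2.5-72B-Instruct"])))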
frontend/src/components/BenchmarkEvaluation.jsx CHANGED
@@ -4,20 +4,21 @@ import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";
 
 // Temps de simulation en millisecondes pour les documents précalculés
-const SIMULATION_DURATION = 60000; // 20 secondes
+const SIMULATION_DURATION = 70000; // 20 secondes
 
 // Intervalle de changement des messages pour les documents standards vs précalculés
 const MESSAGE_CHANGE_INTERVAL = {
-  DEFAULT: 20000, // 20 secondes pour documents standards
-  PRECALCULATED: 5000, // 5 secondes pour documents précalculés
+  DEFAULT: 25000, // 20 secondes pour documents standards
+  PRECALCULATED: 25000, // 5 secondes pour documents précalculés
 };
 
 // Starting messages with their timing
 const STARTING_MESSAGES = [
-  { message: "Initializing evaluation environment...", step: 1, totalSteps: 4 },
-  { message: "Starting evaluation process...", step: 2, totalSteps: 4 },
-  { message: "Evaluating models...", step: 3, totalSteps: 4 },
-  { message: "Storing evaluation results...", step: 4, totalSteps: 4 },
+  { message: "Initializing evaluation environment...", step: 1, totalSteps: 5 },
+  { message: "Finding available model providers...", step: 2, totalSteps: 5 },
+  { message: "Starting evaluation process...", step: 3, totalSteps: 5 },
+  { message: "Evaluating models...", step: 4, totalSteps: 5 },
+  { message: "Storing evaluation results...", step: 5, totalSteps: 5 },
 ];
 
 const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
@@ -29,6 +30,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
   const [error, setError] = useState(null);
   const [elapsedTime, setElapsedTime] = useState(0);
   const [startingMessageIndex, setStartingMessageIndex] = useState(0);
+  const [evaluationStarted, setEvaluationStarted] = useState(false);
 
   const timerIntervalRef = useRef(null);
   const startTimeRef = useRef(null);
@@ -86,7 +88,8 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      if (
        document.visibilityState === "visible" &&
        !isDefault &&
-        !evaluationComplete
+        !evaluationComplete &&
+        evaluationStarted // Vérifier si l'évaluation a déjà commencé
      ) {
        console.log("Page became visible, checking evaluation status...");
        // Force une nouvelle requête pour récupérer l'état d'évaluation
@@ -140,7 +143,10 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
    if (isDefault) {
      simulateEvaluation();
    } else {
-      startEvaluation();
+      // Démarrer l'évaluation seulement si elle n'a pas déjà été lancée
+      if (!evaluationStarted) {
+        startEvaluation();
+      }
    }
 
    // Clean up intervals on unmount
@@ -156,7 +162,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      }
      document.removeEventListener("visibilitychange", handleVisibilityChange);
    };
-  }, [isDefault, sessionId, evaluationComplete]);
+  }, [isDefault, sessionId, evaluationComplete, evaluationStarted]);
 
  // Simulate the evaluation process for pre-calculated documents
  const simulateEvaluation = () => {
@@ -192,6 +198,9 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
      return;
    }
 
+    // Marquer que l'évaluation a commencé
+    setEvaluationStarted(true);
+
    try {
      // Call API to start evaluation
      const response = await fetch(
@@ -307,7 +316,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
            fontWeight: 500,
          }}
        >
-          Estimated time: ~1 min
+          Estimated time ~ 1min 30s
        </Typography>
      </Box>
 
 
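The five frontend STARTING_MESSAGES mirror the step names the backend now records through EvaluationTask.update_step(). The wiring between the two is not part of this diff, so the correspondence sketched below is illustrative only:

    # Sketch only: illustrative mapping between backend step names (from
    # evaluation_task.py above) and the frontend messages shown to the user.
    STEP_TO_MESSAGE = {
        "initializing": "Initializing evaluation environment...",
        "finding_available_model_providers": "Finding available model providers...",
        "starting_evaluation_process": "Starting evaluation process...",
        "evaluating_models": "Evaluating models...",
        "storing_evaluation_results": "Storing evaluation results...",
    }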
frontend/src/components/BenchmarkGenerator.jsx CHANGED
@@ -18,9 +18,6 @@ const BENCHMARK_STEPS = [
   "summarization",
   "chunking",
   "single_shot_question_generation",
-  "evaluation_provider_check",
-  "evaluation",
-  "evaluation_saving_results",
 ];
 
 // Step labels for display (more user-friendly names)