Commit a86c1f9 · Parent(s): ab8d3f5
tfrere committed

update intro | fix evaluation

backend/routes/benchmark.py CHANGED
@@ -214,7 +214,15 @@ class UnifiedBenchmarkTask:
 
             # Mark as completed
             self.is_completed = True
-            self._add_log("[SUCCESS] Benchmark process completed successfully")
+
+            # Check whether an error was detected in the benchmark logs
+            has_error = any("[ERROR]" in log for log in final_logs)
+            benchmark_terminated_with_error = any("Benchmark process terminated with error code" in log for log in final_logs)
+            benchmark_already_marked_success = any("Benchmark process completed successfully" in log for log in final_logs)
+
+            # Only add the success message if no error was detected
+            if not has_error and not benchmark_terminated_with_error and not benchmark_already_marked_success:
+                self._add_log("[SUCCESS] Benchmark process completed successfully")
 
         except Exception as config_error:
             error_msg = str(config_error)
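
The fix above is a log-scan gate: before appending the final success entry, the task checks the log lines it has already captured. A minimal standalone sketch of that check, assuming final_logs is a plain list of log strings collected during the run (the sample entries are illustrative, not actual app output):

    # Sketch: only report success when no error markers appear in the captured logs.
    final_logs = [
        "[INFO] Executing command: ...",          # illustrative entries
        "[SUCCESS] Stage completed: ingestion",
    ]

    has_error = any("[ERROR]" in log for log in final_logs)
    terminated_with_error = any(
        "Benchmark process terminated with error code" in log for log in final_logs
    )
    already_marked_success = any(
        "Benchmark process completed successfully" in log for log in final_logs
    )

    if not (has_error or terminated_with_error or already_marked_success):
        print("[SUCCESS] Benchmark process completed successfully")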
backend/tasks/create_bench.py CHANGED
@@ -158,15 +158,19 @@ class CreateBenchTask:
             if self.process:
                 exit_code = self.process.poll()
                 if exit_code == 0:
+                    # Only add the success message if the exit code is 0
                     self._add_log("[SUCCESS] Benchmark process completed successfully")
                 else:
                     # If a rate-limiting error was detected, show a specific message
                     if rate_limit_detected:
                         self._add_log("[ERROR] Benchmark process failed due to API rate limiting. The demo is under heavy load at the moment.")
-                    else:
-                        self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
+                    # else:
+                    #     self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
+                    # Informational message that the process finished with errors
+                    self._add_log("[INFO] Benchmark process completed with errors")
         except Exception as e:
             self._add_log(f"[ERROR] Error during output capture: {str(e)}")
+            # Do not add a success message when an exception occurs
         finally:
             self.is_completed = True
             self.is_running_flag.clear()
@@ -201,32 +205,6 @@ class CreateBenchTask:
         # If no match is found, return the original name
         return stage_name
 
-    def _simulate_ingestion_process(self) -> None:
-        """
-        Simulate the ingestion process for development mode
-        """
-        self._add_log("[INFO] Simulating ingestion process")
-
-        # Simulate the stages using the same names the frontend expects
-        steps = [
-            ("ingestion", 2),
-            ("upload_ingest_to_hub", 3),
-            ("summarization", 2),
-            ("chunking", 3),
-            ("single_shot_question_generation", 4)
-        ]
-
-        for step, delay in steps:
-            # Log the start of the stage
-            self._add_log(f"[INFO] Processing {step}...")
-            time.sleep(delay)  # Simulate a delay
-            # Mark the stage as completed
-            self._add_log(f"[SUCCESS] Stage completed: {step}")
-
-        # Mark the task as completed
-        self.is_completed = True
-        self._add_log("[SUCCESS] Benchmark process completed successfully")
-
     def run(self, token: Optional[str] = None) -> None:
         """
         Run the ingestion task
@@ -278,12 +256,6 @@ class CreateBenchTask:
         env["HF_ORGANIZATION"] = os.getenv("HF_ORGANIZATION", "yourbench")
         self._add_log("[INFO] Environment variables exported")
 
-        # In development mode, only simulate ingestion
-        if os.environ.get("DEVELOPMENT_MODE", "").lower() == "true":
-            self._add_log("[INFO] Development mode enabled, simulating ingestion")
-            self._simulate_ingestion_process()
-            return
-
         # Start the process
         self._add_log(f"[INFO] Executing command: {' '.join(self.command)}")
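
For context, the rewritten branch sits on the standard subprocess pattern: poll() returns None while the child is still running and its exit status once it has finished. A hedged sketch of the same flow outside the task class, with a placeholder command and a hard-coded rate_limit_detected flag (in the real task the flag is set while scanning the process output):

    import subprocess
    import sys

    # Placeholder child process; the real task launches the yourbench pipeline.
    process = subprocess.Popen([sys.executable, "-c", "pass"])
    process.wait()

    rate_limit_detected = False  # set while scanning output in the real task
    exit_code = process.poll()   # exit status, now that the child has finished

    if exit_code == 0:
        # Only log success when the exit code is 0
        print("[SUCCESS] Benchmark process completed successfully")
    else:
        if rate_limit_detected:
            print("[ERROR] Benchmark process failed due to API rate limiting. "
                  "The demo is under heavy load at the moment.")
        # The exact error code is no longer surfaced; an informational line is logged instead.
        print("[INFO] Benchmark process completed with errors")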
 
backend/tasks/create_bench_config_file.py CHANGED
@@ -124,7 +124,7 @@ class CreateBenchConfigTask:
             # "Qwen/Qwen2.5-72B-Instruct"
             # "meta-llama/Llama-3.1-8B-Instruct"
             # "Qwen/Qwen2.5-32B-Instruct",
-            # "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+            "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
         ]
 
         # Track found models
@@ -167,11 +167,11 @@
             "model_list": model_list,
 
             "model_roles": {
-                "ingestion": ["Qwen/Qwen2.5-32B-Instruct"],
-                "summarization": ["Qwen/Qwen2.5-32B-Instruct"],
+                "ingestion": ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B"],
+                "summarization": ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B"],
                 "chunking": ["intfloat/multilingual-e5-large-instruct"],
-                "single_shot_question_generation": ["Qwen/Qwen2.5-32B-Instruct"],
-                "multi_hop_question_generation": ["Qwen/Qwen2.5-32B-Instruct"],
+                "single_shot_question_generation": ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B"],
+                "multi_hop_question_generation": ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B"],
             },
             "pipeline": {
                 "ingestion": {
frontend/src/components/Benchmark/Generator.jsx CHANGED
@@ -513,7 +513,7 @@ const Generator = ({ sessionId, isDefaultDocument, onComplete }) => {
                 fontWeight: 500,
               }}
             >
-              Estimated time: ~ 1m30s
+              Estimated time ~ 1m30s
             </Typography>
           </Box>
 
frontend/src/components/Evaluation/Evaluation.jsx CHANGED
@@ -1,12 +1,43 @@
-import React from "react";
+import React, { useState, useEffect, useRef } from "react";
 import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 import { useNavigate, useSearchParams } from "react-router-dom";
 import ErrorOutlineIcon from "@mui/icons-material/ErrorOutline";
-import { useSimulation } from "./hooks/useSimulation";
 import { useTimer } from "./hooks/useTimer";
 import { useEvaluation } from "./hooks/useEvaluation";
 import ErrorDisplay from "../common/ErrorDisplay";
 
+// Simulation messages for default documents, with custom timings
+const SIMULATION_MESSAGES = [
+  {
+    message: "Initializing evaluation environment",
+    step: 1,
+    totalSteps: 5,
+    timing: 0,
+  }, // Immediately
+  {
+    message: "Finding available model providers",
+    step: 2,
+    totalSteps: 5,
+    timing: 1500,
+  }, // After 1.5s
+  {
+    message: "Starting evaluation process",
+    step: 3,
+    totalSteps: 5,
+    timing: 3000,
+  }, // After 3s
+  { message: "Evaluating models", step: 4, totalSteps: 5, timing: 5000 }, // After 5s
+  {
+    message: "Storing evaluation results",
+    step: 5,
+    totalSteps: 5,
+    timing: 7000,
+  }, // After 7s
+];
+
+// Total duration before redirecting at the end of the simulation
+const TOTAL_SIMULATION_DURATION = 8000; // 8 seconds
+
 const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
   const [searchParams] = useSearchParams();
   const isDefault =
@@ -14,18 +45,14 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
     ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
 
   const navigate = useNavigate();
+  const simulationTimeoutsRef = useRef([]);
+
+  // Simulation state
+  const [simulationStep, setSimulationStep] = useState(0);
+  const [simulationComplete, setSimulationComplete] = useState(false);
 
   // Use our custom hooks
   const { formatElapsedTime, stopTimer } = useTimer();
-  const {
-    startingMessageIndex,
-    evaluationComplete: simulationComplete,
-    currentMessage,
-  } = useSimulation(() => {
-    if (onComplete) {
-      onComplete();
-    }
-  });
   const {
     error,
     evaluationComplete: realComplete,
@@ -40,22 +67,57 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
     }
   });
 
+  // Drive the simulation for default documents
+  useEffect(() => {
+    // Only run for default documents, and only once
+    if (!isDefault || simulationTimeoutsRef.current.length > 0) return;
+
+    console.log("Starting simulation for default document:", sessionId);
+
+    // Create a timeout for each step with its custom timing
+    for (let i = 1; i < SIMULATION_MESSAGES.length; i++) {
+      const messageData = SIMULATION_MESSAGES[i];
+      const timeout = setTimeout(() => {
+        console.log(`Simulation step ${i + 1}: ${messageData.message}`);
+        setSimulationStep(i);
+      }, messageData.timing);
+
+      simulationTimeoutsRef.current.push(timeout);
+    }
+
+    // Final timeout to end the simulation
+    const finalTimeout = setTimeout(() => {
+      console.log("Simulation complete, redirecting");
+      setSimulationComplete(true);
+      if (onComplete) {
+        onComplete();
+      }
+    }, TOTAL_SIMULATION_DURATION);
+
+    simulationTimeoutsRef.current.push(finalTimeout);
+
+    // Clean up on unmount
+    return () => {
+      simulationTimeoutsRef.current.forEach(clearTimeout);
+    };
+  }, [isDefault, sessionId, onComplete]);
+
   // Handle automatic redirection when evaluation is complete
-  React.useEffect(() => {
+  useEffect(() => {
     if (realComplete || simulationComplete) {
       navigate(`/evaluation-display?session=${sessionId}`);
     }
   }, [realComplete, simulationComplete, sessionId, navigate]);
 
   // Start evaluation if not default and not started
-  React.useEffect(() => {
+  useEffect(() => {
     if (!isDefault && !evaluationStarted) {
       startEvaluation();
     }
   }, [isDefault, evaluationStarted, startEvaluation]);
 
   // Stop timer when complete
-  React.useEffect(() => {
+  useEffect(() => {
     if (realComplete || simulationComplete) {
       stopTimer();
     }
@@ -63,7 +125,7 @@ const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
 
   const isComplete = realComplete || simulationComplete;
   const currentStepInfo = isDefault
-    ? `${currentMessage.message} (${currentMessage.step}/${currentMessage.totalSteps})`
+    ? `${SIMULATION_MESSAGES[simulationStep].message} (${SIMULATION_MESSAGES[simulationStep].step}/${SIMULATION_MESSAGES[simulationStep].totalSteps})`
     : `${currentStepLabel} (${currentStep + 1}/${totalSteps})`;
 
   return (
frontend/src/components/Evaluation/hooks/useSimulation.js CHANGED
@@ -1,7 +1,8 @@
 import { useState, useRef, useEffect } from "react";
 
 // Simulation time in milliseconds for pre-calculated documents
-const SIMULATION_DURATION = 120000; // 2 minutes
+const SIMULATION_DURATION = 8000; // 8 seconds total
+const STEP_DURATION = SIMULATION_DURATION / 5; // Duration of each step
 
 // Starting messages with their timing
 const STARTING_MESSAGES = [
@@ -12,44 +13,46 @@ const STARTING_MESSAGES = [
   { message: "Storing evaluation results", step: 5, totalSteps: 5 },
 ];
 
-export const useSimulation = (onComplete) => {
+export const useSimulation = (onComplete, shouldStart = false) => {
   const [startingMessageIndex, setStartingMessageIndex] = useState(0);
   const [evaluationComplete, setEvaluationComplete] = useState(false);
-  const simulationTimeoutRef = useRef(null);
-  const startingMessageIntervalRef = useRef(null);
+  const timeoutsRef = useRef([]);
+  const hasInitializedRef = useRef(false);
 
+  // Start the simulation when shouldStart is true
   useEffect(() => {
-    // Configure automatic interval for message changes
-    startingMessageIntervalRef.current = setInterval(() => {
-      setStartingMessageIndex((prev) => {
-        if (prev < STARTING_MESSAGES.length - 1) {
-          return prev + 1;
-        }
-        return prev;
-      });
-    }, SIMULATION_DURATION / STARTING_MESSAGES.length);
-
-    // Complete after simulation duration
-    simulationTimeoutRef.current = setTimeout(() => {
+    if (!shouldStart || hasInitializedRef.current) return;
+
+    // Mark as initialized
+    hasInitializedRef.current = true;
+    console.log("Simulation starting with shouldStart =", shouldStart);
+
+    // Schedule sequential timeouts for each step
+    for (let i = 1; i < STARTING_MESSAGES.length; i++) {
+      const timeout = setTimeout(() => {
+        console.log(`Setting message index to ${i}`);
+        setStartingMessageIndex(i);
+      }, i * STEP_DURATION);
+
+      timeoutsRef.current.push(timeout);
+    }
+
+    // Schedule the end of the simulation
+    const completeTimeout = setTimeout(() => {
+      console.log("Completing simulation");
       setEvaluationComplete(true);
-      if (startingMessageIntervalRef.current) {
-        clearInterval(startingMessageIntervalRef.current);
-      }
-      setStartingMessageIndex(STARTING_MESSAGES.length - 1);
       if (onComplete) {
        onComplete();
      }
    }, SIMULATION_DURATION);
 
+    timeoutsRef.current.push(completeTimeout);
+
    return () => {
-      if (simulationTimeoutRef.current) {
-        clearTimeout(simulationTimeoutRef.current);
-      }
-      if (startingMessageIntervalRef.current) {
-        clearInterval(startingMessageIntervalRef.current);
-      }
+      // Clear all timeouts on unmount
+      timeoutsRef.current.forEach(clearTimeout);
    };
-  }, [onComplete]);
+  }, [shouldStart, onComplete]);
 
   return {
     startingMessageIndex,
frontend/src/components/Intro.jsx CHANGED
@@ -43,8 +43,12 @@ const Intro = () => {
          <b>domain-specific benchmarks</b> in a <b>zero-shot</b> manner. It aims
          to keep your large language models on their toes—even as new data
          sources, domains, and knowledge demands evolve.
-          <br />
-          <br /> Currently, this is an <b>extremely minimal demo</b>. <br />
+        </Typography>
+        <Typography
+          variant="body2"
+          sx={{ maxWidth: "800px", mx: "auto", mt: 2, opacity: 0.4 }}
+        >
+          Currently, this is an <b>extremely minimal demo</b>. <br />
          To <b>unlock the full capabilities</b>, please visit our{" "}
          <Link
            href="https://github.com/yourbench"