tfrere committed
Commit 7e389db · 1 Parent(s): 39acd70

cleanup generation logs

backend/clean_and_restart_eval.py CHANGED
@@ -93,19 +93,7 @@ async def main(session_id, dataset_name, threshold=None):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Clean up and re-run a LightEval evaluation")
-    parser.add_argument("session_id", help="ID of the session to clean and re-evaluate")
-    parser.add_argument("--dataset", "-d", dest="dataset_name",
-                        help="Name of the dataset to evaluate (default: derived from the session ID)")
-    parser.add_argument("--threshold", "-t", dest="threshold", type=int, default=None,
-                        help="Threshold for sentiment analysis (difference between positive and negative words)")
-
-    args = parser.parse_args()
-
-    # If the dataset name is not provided, build it from the session ID
-    if not args.dataset_name:
-        args.dataset_name = f"yourbench/yourbench_{args.session_id}"
-
+
     # Run the main function asynchronously
-    exit_code = asyncio.run(main(args.session_id, args.dataset_name, args.threshold))
+    exit_code = asyncio.run(main("pokemon-guide", "yourbench/yourbench_hurricane-faq", -1))
     sys.exit(exit_code)
backend/examine_judge.py DELETED
@@ -1,115 +0,0 @@
-import re
-import os
-from pprint import pprint
-
-# Path to the judge log file
-log_file = "lighteval_judge.log"
-
-# Function to extract the judge's evaluations
-def extract_judge_evaluations(log_content):
-    # Pattern to find the judge's responses
-    pattern = r"Judge response: (.*?)(?=Judge response:|$)"
-
-    # Extract all responses
-    responses = re.findall(pattern, log_content, re.DOTALL)
-
-    # Parse each response to extract the final decision
-    evaluations = []
-    for i, response in enumerate(responses):
-        # Look for the final decision inside the XML tags
-        final_answer_match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
-
-        if final_answer_match:
-            final_answer = final_answer_match.group(1).strip()
-            evaluations.append({
-                "id": i+1,
-                "final_answer": final_answer,
-                "full_response": response[:500] + "..." if len(response) > 500 else response
-            })
-        else:
-            # If there is no XML tag, look for keywords
-            if re.search(r"\b(correct|vrai|true|yes|1)\b", response, re.IGNORECASE):
-                final_answer = "1 (inferred without XML tag)"
-            elif re.search(r"\b(incorrect|faux|false|no|0)\b", response, re.IGNORECASE):
-                final_answer = "0 (inferred without XML tag)"
-            else:
-                final_answer = "Not detected"
-
-            evaluations.append({
-                "id": i+1,
-                "final_answer": final_answer,
-                "full_response": response[:500] + "..." if len(response) > 500 else response
-            })
-
-    return evaluations
-
-# Function to extract the prompts sent to the judge
-def extract_judge_prompts(log_content):
-    # Pattern to find the prompts
-    pattern = r"Prompt sent to judge: (.*?)(?=Prompt sent to judge:|Judge response:|$)"
-
-    # Extract all prompts
-    prompts = re.findall(pattern, log_content, re.DOTALL)
-
-    # Parse each prompt
-    analyzed_prompts = []
-    for i, prompt in enumerate(prompts):
-        # Extract the questions, answers and reference answers
-        question_match = re.search(r"<question>(.*?)</question>", prompt, re.DOTALL)
-        model_answer_match = re.search(r"<model_answer>(.*?)</model_answer>", prompt, re.DOTALL)
-        gold_answer_match = re.search(r"<gold_answer>(.*?)</gold_answer>", prompt, re.DOTALL)
-
-        question = question_match.group(1).strip() if question_match else "Not detected"
-        model_answer = model_answer_match.group(1).strip() if model_answer_match else "Not detected"
-        gold_answer = gold_answer_match.group(1).strip() if gold_answer_match else "Not detected"
-
-        analyzed_prompts.append({
-            "id": i+1,
-            "question": question,
-            "model_answer": model_answer[:200] + "..." if len(model_answer) > 200 else model_answer,
-            "gold_answer": gold_answer[:200] + "..." if len(gold_answer) > 200 else gold_answer
-        })
-
-    return analyzed_prompts
-
-# Read the log file
-if os.path.exists(log_file):
-    with open(log_file, 'r', encoding='utf-8') as f:
-        log_content = f.read()
-
-    # Extract the evaluations
-    evaluations = extract_judge_evaluations(log_content)
-
-    # Extract the prompts
-    prompts = extract_judge_prompts(log_content)
-
-    # Print the evaluation summary
-    print(f"Total number of evaluations: {len(evaluations)}")
-    print("\nSummary of decisions:")
-    decisions = {}
-    for eval in evaluations:
-        decision = eval["final_answer"]
-        decisions[decision] = decisions.get(decision, 0) + 1
-
-    for decision, count in decisions.items():
-        print(f"  {decision}: {count} times ({count/len(evaluations)*100:.1f}%)")
-
-    # Print the details of each evaluation
-    print("\n" + "="*80)
-    print("DETAILS OF QUESTION/ANSWER/REFERENCE/DECISION COMPARISONS")
-    print("="*80 + "\n")
-
-    for i in range(min(len(prompts), len(evaluations))):
-        prompt = prompts[i]
-        eval = evaluations[i]
-
-        print(f"EXAMPLE {i+1}:")
-        print(f"Question: {prompt['question']}")
-        print(f"\nModel answer: {prompt['model_answer']}")
-        print(f"\nReference answer: {prompt['gold_answer']}")
-        print(f"\nJudge decision: {eval['final_answer']}")
-        print(f"\nExcerpt from the judge's full response:")
-        print(eval['full_response'][:300] + "..." if len(eval['full_response']) > 300 else eval['full_response'])
-        print("\n" + "-"*80 + "\n")
-else:
-    print(f"Log file {log_file} not found.")
backend/examine_parquet.py DELETED
@@ -1,50 +0,0 @@
-import pandas as pd
-import sys
-from pprint import pprint
-import numpy as np
-
-# Path to the parquet file
-parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-
-# Load the parquet file
-df = pd.read_parquet(parquet_file)
-
-# Print basic information
-print(f"Total number of examples: {len(df)}")
-print(f"Available columns: {', '.join(df.columns)}")
-print(f"Accuracy metrics: {df['metrics'].tolist()}")
-print("\n" + "="*80 + "\n")
-
-# Examine a few examples in more detail
-for i in range(min(3, len(df))):
-    print(f"EXAMPLE {i+1}:")
-    print(f"Question: {df.iloc[i].specifics.get('question', 'N/A')}")
-    print(f"Model answer: {df.iloc[i].predictions[0]}")
-    print(f"Reference answer (choice): {df.iloc[i].choices[0]}")
-    print(f"Gold index: {df.iloc[i].gold_index}")
-
-    # Print the document
-    print("\nDocument:")
-    doc = df.iloc[i].specifics.get('document', 'N/A')
-    print(doc[:500] + "..." if len(doc) > 500 else doc)
-
-    # Print the chunks
-    print("\nChunks:")
-    chunks = df.iloc[i].specifics.get('chunks', None)
-    if chunks is not None and len(chunks) > 0:
-        for j in range(len(chunks)):
-            chunk_text = chunks[j]
-            if isinstance(chunk_text, str):
-                print(f"  Chunk {j+1}: {chunk_text[:300]}..." if len(chunk_text) > 300 else f"  Chunk {j+1}: {chunk_text}")
-            else:
-                print(f"  Chunk {j+1}: {type(chunk_text)}")
-    else:
-        print("  No chunks available")
-
-    # Print other metadata
-    print("\nMetadata:")
-    print(f"  Question category: {df.iloc[i].specifics.get('question_category', 'N/A')}")
-    print(f"  Estimated difficulty: {df.iloc[i].specifics.get('estimated_difficulty', 'N/A')}")
-    print(f"  Question generating model: {df.iloc[i].specifics.get('question_generating_model', 'N/A')}")
-
-    print("\n" + "="*80 + "\n")
backend/examine_results.py DELETED
@@ -1,70 +0,0 @@
-import pandas as pd
-import sys
-import re
-import difflib
-from pprint import pprint
-
-# Path to the parquet file
-parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-
-# Function to clean the answers (remove XML tags, extra whitespace, etc.)
-def clean_response(response):
-    # Remove the XML tags
-    response = re.sub(r'<answer>(.*?)</answer>', r'\1', response, flags=re.DOTALL)
-    # Normalize whitespace
-    response = ' '.join(response.split())
-    return response.lower().strip()
-
-# Load the parquet file
-df = pd.read_parquet(parquet_file)
-
-# Print basic information
-print(f"Total number of examples: {len(df)}")
-print(f"All scores: {[metric.get('accuracy', 'N/A') for metric in df['metrics']]}")
-print("\n" + "="*80 + "\n")
-
-# Analyze the similarity between the model answers and the reference answers
-print("SIMILARITY ANALYSIS BETWEEN MODEL ANSWERS AND REFERENCE ANSWERS\n")
-
-total_correct_content = 0
-
-for i in range(len(df)):
-    # Extract the answers
-    model_answer = df.iloc[i].predictions[0] if len(df.iloc[i].predictions) > 0 else "N/A"
-    reference_answer = df.iloc[i].choices[0] if len(df.iloc[i].choices) > 0 else "N/A"
-    question = df.iloc[i].specifics.get('question', 'N/A')
-
-    # Clean the answers for comparison
-    clean_model = clean_response(model_answer)
-    clean_reference = clean_response(reference_answer)
-
-    # Compute the similarity
-    similarity = difflib.SequenceMatcher(None, clean_model, clean_reference).ratio()
-
-    # Check whether the key elements of the reference answer appear in the model answer
-    key_terms = clean_reference.split()
-    important_terms = [term for term in key_terms if len(term) > 4]  # Words longer than 4 letters
-
-    terms_found = sum(1 for term in important_terms if term in clean_model)
-    term_coverage = terms_found / len(important_terms) if important_terms else 0
-
-    # Decide whether the answer content is correct (using a threshold)
-    is_content_correct = term_coverage > 0.5 or similarity > 0.4
-    if is_content_correct:
-        total_correct_content += 1
-
-    # Print the results
-    print(f"EXAMPLE {i+1}:")
-    print(f"Question: {question}")
-    print(f"Model answer (cleaned): {clean_model[:150]}..." if len(clean_model) > 150 else f"Model answer (cleaned): {clean_model}")
-    print(f"Reference answer (cleaned): {clean_reference}")
-    print(f"Similarity ratio: {similarity:.2f}")
-    print(f"Coverage of important terms: {term_coverage:.2f} ({terms_found}/{len(important_terms)})")
-    print(f"Answer content judged correct? {'YES' if is_content_correct else 'NO'}")
-
-    # Some additional information
-    print(f"LightEval metric: {df.iloc[i].metrics.get('accuracy', 'N/A')}")
-    print("-"*80 + "\n")
-
-print(f"SUMMARY: {total_correct_content}/{len(df)} answers ({total_correct_content/len(df)*100:.1f}%) have content judged correct by our simple analysis.")
-print(f"Compared to LightEval: {sum(metric.get('accuracy', 0) for metric in df['metrics'])}/{len(df)} correct answers.")
backend/examine_strict_results.py DELETED
@@ -1,71 +0,0 @@
-import pandas as pd
-import sys
-import re
-from pprint import pprint
-
-# Paths to the parquet files
-parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
-
-try:
-    # Load the parquet files
-    print("Loading data...")
-    df_original = pd.read_parquet(parquet_file_original)
-    df_strict = pd.read_parquet(parquet_file_strict)
-
-    # Print basic information
-    print(f"Number of original examples: {len(df_original)}")
-    print(f"Number of strict examples: {len(df_strict)}")
-    print(f"Original scores: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
-    print(f"Strict scores: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")
-
-    print("\n" + "="*80 + "\n")
-    print("COMPARISON OF RESULTS")
-    print("="*80 + "\n")
-
-    # Compare the results
-    for i in range(min(len(df_original), len(df_strict))):
-        print(f"EXAMPLE {i+1}:")
-
-        # Question
-        question_orig = df_original.iloc[i].specifics.get('question', 'N/A')
-        question_strict = df_strict.iloc[i].specifics.get('question', 'N/A')
-        print(f"Question: {question_orig}")
-
-        # Evaluation
-        score_orig = df_original.iloc[i].metrics.get('accuracy', 'N/A')
-        score_strict = df_strict.iloc[i].metrics.get('accuracy', 'N/A')
-        print(f"Original score: {score_orig}")
-        print(f"Strict score: {score_strict}")
-
-        # Answers
-        model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
-        model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"
-
-        # References
-        reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
-        reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"
-
-        # Compare the answers - identical or different
-        responses_identical = model_answer_orig == model_answer_strict
-        references_identical = reference_orig == reference_strict
-
-        print(f"Model answers identical: {'Yes' if responses_identical else 'No'}")
-        print(f"References identical: {'Yes' if references_identical else 'No'}")
-
-        # Show the change that led to a different result
-        if score_orig != score_strict:
-            print(f"\nPossible reason for the score change:")
-            print(f"  Stricter evaluation criteria in the system prompt")
-            print(f"  Rejection of answers containing hedges (however, but, although, etc.)")
-
-        print("-"*80 + "\n")
-
-except Exception as e:
-    print(f"Error: {e}")
-
-    if "df_original" in locals():
-        print("\nColumns in df_original:", df_original.columns.tolist())
-
-    if "df_strict" in locals():
-        print("\nColumns in df_strict:", df_strict.columns.tolist())
backend/main.py CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
-from routes import routers, session_files, active_bench_tasks, benchmark
+from routes import routers, session_files, active_tasks, benchmark
 
 # Load environment variables from .env file
 load_dotenv()
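
Note: main.py's body beyond these imports is not part of this commit. For orientation only, a minimal sketch of how the re-exported `routers` list is typically wired into a FastAPI app; the CORS settings and the include_router loop are assumptions based on standard FastAPI usage, not taken from this repository:

    # Hypothetical wiring sketch -- not shown in this diff.
    from fastapi import FastAPI
    from fastapi.middleware.cors import CORSMiddleware

    from routes import routers, session_files, active_tasks  # re-exported by routes/__init__.py

    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],   # assumption: permissive CORS for a demo Space
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Register every router exposed by the routes package
    for router in routers:
        app.include_router(router)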
backend/routes/__init__.py CHANGED
@@ -1,7 +1,7 @@
 # Module routes
 from .health import router as health_router
 from .upload import router as upload_router, session_files
-from .benchmark import router as benchmark_router, active_bench_tasks
+from .benchmark import router as benchmark_router, active_tasks
 from .questions import router as questions_router
 from .download import router as download_router
 from .evaluation import router as evaluation_router, active_evaluation_tasks
@@ -20,4 +20,4 @@ routers = [
 benchmark_router.session_files = session_files
 
 # Expose the shared variables for main.py
-__all__ = ['routers', 'session_files', 'active_bench_tasks', 'active_evaluation_tasks']
+__all__ = ['routers', 'session_files', 'active_tasks', 'active_evaluation_tasks']
backend/routes/benchmark.py CHANGED
@@ -8,8 +8,7 @@ from tasks.create_bench import CreateBenchTask
 router = APIRouter(tags=["benchmark"])
 
 # Store active tasks by session_id (imported in main.py)
-active_bench_tasks = {}
-active_config_tasks = {}
+active_tasks = {}
 
 # Reference to session_files (will be provided by main.py)
 # This declaration will be overwritten by the assignment in __init__.py
@@ -24,7 +23,7 @@ async def generate_benchmark(data: Dict[str, Any]):
         data: Dictionary containing session_id
 
     Returns:
-        Dictionary with logs and config_path
+        Dictionary with logs and status
     """
     session_id = data.get("session_id")
 
@@ -39,23 +38,20 @@ async def generate_benchmark(data: Dict[str, Any]):
     all_logs = []
 
     try:
-        # Step 1: Generate configuration file
-        config_task = CreateBenchConfigTask(session_uid=session_id)
-        # Store the config task for later log retrieval
-        active_config_tasks[session_id] = config_task
+        # Initialize the task that will handle the whole process
+        task = UnifiedBenchmarkTask(session_uid=session_id)
 
-        # Start configuration generation asynchronously
-        config_path = config_task.run(file_path=file_path)
+        # Store it so the logs can be retrieved later
+        active_tasks[session_id] = task
 
-        # Add initial logs
-        all_logs.extend(config_task.get_logs())
+        # Start the benchmark process
+        task.run(file_path)
 
-        # Step 2: Run the createBench task with the generated config
-        # Note: This will be started by a separate endpoint once configuration is done
+        # Get the initial logs
+        all_logs = task.get_logs()
 
         return {
            "status": "running",
-           "config_path": config_path,
            "logs": all_logs
        }
     except Exception as e:
@@ -65,10 +61,10 @@ async def generate_benchmark(data: Dict[str, Any]):
            "logs": all_logs
        }
 
-@router.get("/config-logs/{session_id}")
-async def get_config_logs(session_id: str):
+@router.get("/benchmark-progress/{session_id}")
+async def get_benchmark_progress(session_id: str):
     """
-    Get the logs for a running configuration task
+    Get the logs and status for a running benchmark task
 
     Args:
         session_id: Session ID for the task
@@ -76,57 +72,131 @@ async def get_config_logs(session_id: str):
     Returns:
         Dictionary with logs and completion status
     """
-    if session_id not in active_config_tasks:
-        raise HTTPException(status_code=404, detail="Configuration task not found")
+    if session_id not in active_tasks:
+        raise HTTPException(status_code=404, detail="Benchmark task not found")
 
-    config_task = active_config_tasks[session_id]
-    logs = config_task.get_logs()
-    is_completed = config_task.is_task_completed()
-
-    # If the configuration is finished and the benchmark has not been started yet,
-    # start the benchmark automatically
-    if is_completed and session_id not in active_bench_tasks:
-        try:
-            # Ensure the config_path is a string
-            config_path_str = f"uploaded_files/{session_id}/config.yml"
-            bench_task = CreateBenchTask(session_uid=session_id, config_path=config_path_str)
-
-            # Store the bench task for later log retrieval
-            active_bench_tasks[session_id] = bench_task
-
-            # Add a transition log
-            logs.append("[INFO] Configuration file generated, starting benchmark creation")
-
-            # Run the task
-            bench_task.run()
-        except Exception as bench_error:
-            error_msg = f"Error starting benchmark creation: {str(bench_error)}"
-            logs.append(f"[ERROR] {error_msg}")
+    task = active_tasks[session_id]
+    logs = task.get_logs()
+    is_completed = task.is_task_completed()
 
     return {
         "logs": logs,
         "is_completed": is_completed
     }
 
-@router.get("/benchmark-logs/{session_id}")
-async def get_benchmark_logs(session_id: str):
-    """
-    Get the logs for a running benchmark task
-
-    Args:
-        session_id: Session ID for the task
-
-    Returns:
-        Dictionary with logs and completion status
-    """
-    if session_id not in active_bench_tasks:
-        raise HTTPException(status_code=404, detail="Benchmark task not found")
-
-    bench_task = active_bench_tasks[session_id]
-    logs = bench_task.get_logs()
-    is_completed = bench_task.is_task_completed()
-
-    return {
-        "logs": logs,
-        "is_completed": is_completed
-    }
+# Class that unifies the whole benchmark process
+class UnifiedBenchmarkTask:
+    """
+    Task that handles the entire benchmark process from configuration to completion
+    """
+
+    def __init__(self, session_uid: str):
+        """
+        Initialize the unified benchmark task
+
+        Args:
+            session_uid: Session ID for this task
+        """
+        self.session_uid = session_uid
+        self.logs = []
+        self.is_completed = False
+        self.config_task = None
+        self.bench_task = None
+
+        self._add_log("[INFO] Initializing benchmark task")
+
+    def _add_log(self, message: str):
+        """
+        Add a log message
+
+        Args:
+            message: Log message to add
+        """
+        if message not in self.logs:  # Avoid duplicates
+            self.logs.append(message)
+            # Force a copy to avoid reference issues
+            self.logs = self.logs.copy()
+            print(f"[{self.session_uid}] {message}")
+
+    def get_logs(self):
+        """
+        Get all logs
+
+        Returns:
+            List of log messages
+        """
+        return self.logs.copy()
+
+    def is_task_completed(self):
+        """
+        Check if the task is completed
+
+        Returns:
+            True if completed, False otherwise
+        """
+        return self.is_completed
+
+    def run(self, file_path: str):
+        """
+        Run the benchmark process
+
+        Args:
+            file_path: Path to the uploaded file
+        """
+        # Start in a separate thread so we don't block
+        import threading
+        thread = threading.Thread(target=self._run_process, args=(file_path,))
+        thread.daemon = True
+        thread.start()
+
+    def _run_process(self, file_path: str):
+        """
+        Internal method to run the process
+
+        Args:
+            file_path: Path to the uploaded file
+        """
+        try:
+            # Step 1: Configuration
+            self._add_log("[INFO] Starting configuration process")
+            self.config_task = CreateBenchConfigTask(session_uid=self.session_uid)
+
+            # Run the configuration task
+            config_path = self.config_task.run(file_path=file_path)
+
+            # Get the configuration logs
+            config_logs = self.config_task.get_logs()
+            for log in config_logs:
+                self._add_log(log)
+
+            # Mark the configuration stage as completed
+            if "[SUCCESS] Stage completed: config_generation" not in self.logs:
+                self._add_log("[SUCCESS] Stage completed: configuration")
+
+            # Step 2: Benchmark
+            self._add_log("[INFO] Starting benchmark process")
+            self.bench_task = CreateBenchTask(session_uid=self.session_uid, config_path=config_path)
+
+            # Run the benchmark task
+            self.bench_task.run()
+
+            # Wait until the benchmark task is finished
+            while not self.bench_task.is_task_completed():
+                # Get the new logs and add them
+                bench_logs = self.bench_task.get_logs()
+                for log in bench_logs:
+                    self._add_log(log)
+                time.sleep(1)
+
+            # Get the final logs
+            final_logs = self.bench_task.get_logs()
+            for log in final_logs:
+                self._add_log(log)
+
+            # Mark as completed
+            self.is_completed = True
+            self._add_log("[SUCCESS] Benchmark process completed successfully")
+
+        except Exception as e:
+            self._add_log(f"[ERROR] Benchmark process failed: {str(e)}")
+            self.is_completed = True
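
Note: the old /config-logs/{session_id} and /benchmark-logs/{session_id} routes are folded into the single /benchmark-progress/{session_id} endpoint above, which returns {"logs": [...], "is_completed": bool}. A minimal client-side polling sketch against it; the base URL, the requests dependency, and the example session id are illustrative assumptions, not part of this diff:

    import time
    import requests

    BASE_URL = "http://localhost:8000"   # assumption: local dev server
    SESSION_ID = "example-session"       # hypothetical session id

    seen = 0
    while True:
        resp = requests.get(f"{BASE_URL}/benchmark-progress/{SESSION_ID}", timeout=10)
        resp.raise_for_status()
        progress = resp.json()

        # Print only the log lines we have not shown yet
        logs = progress.get("logs", [])
        for line in logs[seen:]:
            print(line)
        seen = len(logs)

        if progress.get("is_completed"):
            break
        time.sleep(2)  # the frontend polls every 2 seconds as well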
backend/tasks/create_bench.py CHANGED
@@ -132,7 +132,10 @@ class CreateBenchTask:
                 else:
                     # Detect completed stages
                     if "Completed stage:" in line:
-                        stage = line.split("'")[1] if "'" in line else line
+                        # Extract the stage name
+                        stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
+                        # Standardize stage names to match what the frontend expects
+                        stage = self._standardize_stage_name(stage)
                         self._add_log(f"[SUCCESS] Stage completed: {stage}")
                     else:
                         self._add_log(f"[INFO] {line}")
@@ -141,9 +144,9 @@ class CreateBenchTask:
             if self.process:
                 exit_code = self.process.poll()
                 if exit_code == 0:
-                    self._add_log("[SUCCESS] Ingestion process completed successfully")
+                    self._add_log("[SUCCESS] Benchmark process completed successfully")
                 else:
-                    self._add_log(f"[ERROR] Ingestion process terminated with error code: {exit_code}")
+                    self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
         except Exception as e:
             self._add_log(f"[ERROR] Error during output capture: {str(e)}")
         finally:
@@ -151,6 +154,61 @@ class CreateBenchTask:
             self.is_running_flag.clear()
             self._add_log("[INFO] Output capture completed")
 
+    def _standardize_stage_name(self, stage_name: str) -> str:
+        """
+        Standardize the stage name to match the frontend expectations
+
+        Args:
+            stage_name: Original stage name
+
+        Returns:
+            Standardized stage name
+        """
+        # Mapping table for stage names
+        stage_mapping = {
+            # Add the necessary mappings here
+            # example: "original_name": "standardized_name"
+            "ingest": "ingestion",
+            "upload": "upload_ingest_to_hub",
+            "summarize": "summarization",
+            "chunk": "chunking",
+            "generate_questions": "single_shot_question_generation",
+        }
+
+        # Look for partial matches
+        for key, value in stage_mapping.items():
+            if key in stage_name.lower():
+                return value
+
+        # If no match is found, return the original name
+        return stage_name
+
+    def _simulate_ingestion_process(self) -> None:
+        """
+        Simulate the ingestion process for development mode
+        """
+        self._add_log("[INFO] Simulating ingestion process")
+
+        # Simulate the stages with the same names the frontend expects
+        steps = [
+            ("ingestion", 2),
+            ("upload_ingest_to_hub", 3),
+            ("summarization", 2),
+            ("chunking", 3),
+            ("single_shot_question_generation", 4)
+        ]
+
+        for step, delay in steps:
+            # Add a stage start message
+            self._add_log(f"[INFO] Processing {step}...")
+            time.sleep(delay)  # Simulate a delay
+            # Mark the stage as completed
+            self._add_log(f"[SUCCESS] Stage completed: {step}")
+
+        # Mark the task as completed
+        self.is_completed = True
+        self._add_log("[SUCCESS] Benchmark process completed successfully")
+
     def run(self, token: Optional[str] = None) -> None:
         """
         Run the ingestion task
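
Note: for illustration, the partial-match mapping above can be exercised standalone. The log-line format ("Completed stage: '...'") is taken from the hunk above; the small harness around it is hypothetical:

    def standardize_stage_name(stage_name: str) -> str:
        """Map a raw pipeline stage name onto the step names the frontend tracks."""
        stage_mapping = {
            "ingest": "ingestion",
            "upload": "upload_ingest_to_hub",
            "summarize": "summarization",
            "chunk": "chunking",
            "generate_questions": "single_shot_question_generation",
        }
        for key, value in stage_mapping.items():
            if key in stage_name.lower():
                return value
        return stage_name

    # Example: a raw pipeline log line becomes a frontend-friendly stage label.
    line = "Completed stage: 'chunk'"
    stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
    print(standardize_stage_name(stage))  # -> "chunking"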
frontend/src/components/BenchmarkCreateForm.jsx CHANGED
@@ -188,7 +188,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
           align="center"
           sx={{ mb: 2, color: "text.secondary" }}
         >
-          Choose a sample document
+          To create a benchmark, choose a sample document
         </Typography>
 
         <Grid container spacing={2} sx={{ mb: 2 }}>
frontend/src/components/BenchmarkGenerator.jsx CHANGED
@@ -11,30 +11,30 @@ const SIMULATION_DURATION = 20000; // 20 secondes
 
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
+  "configuration",
   "ingestion",
   "upload_ingest_to_hub",
   "summarization",
   "chunking",
   "single_shot_question_generation",
-  "multi_hop_question_generation",
-  "lighteval",
 ];
 
 // Step labels for display (more user-friendly names)
 const STEP_LABELS = {
+  configuration: "Configuration",
   ingestion: "Ingestion",
   upload_ingest_to_hub: "Upload to Hub",
   summarization: "Summarization",
   chunking: "Chunking",
   single_shot_question_generation: "Question generation",
-  multi_hop_question_generation: "Question generation",
-  lighteval: "Saving results",
 };
 
 // Simulated log messages for pre-calculated documents
 const SIMULATED_LOGS = [
   "[INFO] Initializing benchmark generation...",
   "[INFO] Generating base configuration file...",
+  "[SUCCESS] Stage completed: configuration",
+  "[INFO] Starting ingestion process...",
   "[SUCCESS] Stage completed: ingestion",
   "[INFO] Processing document content for upload...",
   "[SUCCESS] Stage completed: upload_ingest_to_hub",
@@ -44,11 +44,7 @@ const SIMULATED_LOGS = [
   "[SUCCESS] Stage completed: chunking",
   "[INFO] Generating single-shot questions...",
   "[SUCCESS] Stage completed: single_shot_question_generation",
-  "[INFO] Creating multi-hop questions from content...",
-  "[SUCCESS] Stage completed: multi_hop_question_generation",
-  "[INFO] Running LightEval for benchmark validation...",
-  "[SUCCESS] Stage completed: lighteval",
-  "[SUCCESS] Ingestion process completed successfully",
+  "[SUCCESS] Benchmark process completed successfully",
 ];
 
 /**
@@ -70,7 +66,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
   const [error, setError] = useState(null);
   const [currentPhase, setCurrentPhase] = useState("initializing");
   const [completedSteps, setCompletedSteps] = useState([]);
-  const [activeStep, setActiveStep] = useState(0);
+  const [activeStep, setActiveStep] = useState(1);
   const [elapsedTime, setElapsedTime] = useState(0);
 
   // Reference to keep track of the polling interval
@@ -187,7 +183,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
     setError(null);
     setCurrentPhase("initializing");
     setCompletedSteps([]);
-    setActiveStep(0);
+    setActiveStep(1);
 
     // Timing variables for simulation
     const totalSteps = SIMULATED_LOGS.length;
@@ -248,24 +244,37 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
       });
 
       // Determine the active step based on the completed steps
-      let newActiveStep = 0;
+      let newActiveStep = activeStep;
+
       if (newCompletedSteps.length > 0) {
        // Find the most advanced step in the logs
        const maxCompletedStepIndex = Math.max(
          ...newCompletedSteps.map((step) => BENCHMARK_STEPS.indexOf(step))
        );
-        newActiveStep = maxCompletedStepIndex + 1;
+        // Move to the next step
+        const calculatedStep = maxCompletedStepIndex + 1;
+
+        // Only advance if the new step is further along than the current one
+        if (calculatedStep > activeStep) {
+          newActiveStep = calculatedStep;
+        }
 
        // Make sure activeStep does not exceed the total number of steps
        if (newActiveStep >= BENCHMARK_STEPS.length) {
          newActiveStep = BENCHMARK_STEPS.length;
        }
+      } else if (activeStep === 0) {
+        // If no step was found and the active step is 0, move to 1
+        newActiveStep = 1;
      }
 
      // Update state if the steps have changed
-      // Compare the arrays with JSON.stringify for a deep comparison
      if (JSON.stringify(newCompletedSteps) !== JSON.stringify(completedSteps)) {
        setCompletedSteps(newCompletedSteps);
+      }
+
+      // Only update the active step if it changed
+      if (newActiveStep !== activeStep) {
        setActiveStep(newActiveStep);
      }
 
@@ -278,14 +287,12 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
      // Detect completion conditions
      const isComplete =
        recentLogs.some((log) =>
-          log.includes("[SUCCESS] Ingestion process completed successfully")
+          log.includes("[SUCCESS] Benchmark process completed successfully")
        ) ||
        recentLogs.some((log) =>
-          log.includes(
-            "[SUCCESS] Configuration and ingestion completed successfully"
-          )
+          log.includes("[SUCCESS] Ingestion process completed successfully")
        ) ||
-        newCompletedSteps.includes("lighteval") ||
+        newCompletedSteps.includes("single_shot_question_generation") ||
        newActiveStep >= BENCHMARK_STEPS.length;
 
      if (isComplete) {
@@ -305,7 +312,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
        });
      }
    } else if (
-      recentLogs.some((log) => log.includes("starting benchmark creation"))
+      recentLogs.some((log) => log.includes("Starting ingestion process"))
    ) {
      setCurrentPhase("benchmarking");
    } else if (
@@ -333,7 +340,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
    setError(null);
    setCurrentPhase("initializing");
    setCompletedSteps([]);
-    setActiveStep(0);
+    setActiveStep(1);
 
    try {
      // Call the API to generate the benchmark
@@ -355,87 +362,43 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
      if (response.ok) {
        setGenerationLogs(result.logs || []);
 
-        // First, start by polling the configuration logs
-        const pollConfigLogs = async () => {
-          try {
-            // Call the API to get the config logs
-            const configLogsResponse = await fetch(
-              `${API_CONFIG.BASE_URL}/config-logs/${sessionId}`
-            );
-
-            if (configLogsResponse.ok) {
-              const configLogsResult = await configLogsResponse.json();
-
-              // Update logs if there are new ones
-              if (
-                configLogsResult.logs &&
-                configLogsResult.logs.length > generationLogs.length
-              ) {
-                setGenerationLogs(configLogsResult.logs);
-              }
-
-              // If config task is completed, switch to polling benchmark logs
-              if (configLogsResult.is_completed) {
-                // Wait a short moment to let the server start the benchmark
-                setTimeout(() => {
-                  console.log(
-                    "Configuration completed, switching to benchmark polling"
-                  );
-                  clearInterval(configPollingIntervalRef.current);
-                  pollBenchmarkLogs();
-                }, 1000);
-              }
-            }
-          } catch (error) {
-            console.log("Error polling for config logs:", error);
-            // Don't stop polling on network errors
-          }
-        };
-
-        // Function to poll the benchmark logs
-        const pollBenchmarkLogs = async () => {
-          // Set up polling for benchmark logs
-          pollingIntervalRef.current = setInterval(async () => {
-            // Check if we already completed
-            if (generationComplete) {
-              clearInterval(pollingIntervalRef.current);
-              return;
-            }
-
-            try {
-              // Call the API to get the latest benchmark logs
-              const logsResponse = await fetch(
-                `${API_CONFIG.BASE_URL}/benchmark-logs/${sessionId}`
-              );
-
-              if (logsResponse.ok) {
-                const logsResult = await logsResponse.json();
-
-                // Update logs if there are new ones
-                if (
-                  logsResult.logs &&
-                  logsResult.logs.length > generationLogs.length
-                ) {
-                  setGenerationLogs(logsResult.logs);
-                }
-
-                // Check if the task is completed
-                if (logsResult.is_completed) {
-                  setGenerationComplete(true);
-                  clearInterval(pollingIntervalRef.current);
-                  // Notification is now handled in the useEffect above
-                }
-              }
-            } catch (error) {
-              console.log("Error polling for benchmark logs:", error);
-              // Don't stop polling on network errors
-            }
-          }, 3000); // Poll every 3 seconds
-        };
-
-        // Start polling the configuration logs
-        const configPollingIntervalRef = { current: null };
-        configPollingIntervalRef.current = setInterval(pollConfigLogs, 1000); // Poll config logs more frequently (every second)
+        // Set up polling to track progress
+        pollingIntervalRef.current = setInterval(async () => {
+          // Check if we already finished
+          if (generationComplete) {
+            clearInterval(pollingIntervalRef.current);
+            return;
+          }
+
+          try {
+            // Call the API to get the latest logs
+            const logsResponse = await fetch(
+              `${API_CONFIG.BASE_URL}/benchmark-progress/${sessionId}`
+            );
+
+            if (logsResponse.ok) {
+              const logsResult = await logsResponse.json();
+
+              // Update logs if there are new ones
+              if (
+                logsResult.logs &&
+                logsResult.logs.length > generationLogs.length
+              ) {
+                setGenerationLogs(logsResult.logs);
+              }
+
+              // Check if the task is completed
+              if (logsResult.is_completed) {
+                setGenerationComplete(true);
+                clearInterval(pollingIntervalRef.current);
+                // Notification is now handled in the useEffect above
+              }
+            }
+          } catch (error) {
+            console.log("Error polling for logs:", error);
+            // Don't stop polling on network errors
+          }
+        }, 2000); // Poll every 2 seconds
      } else {
        // Handle error
        setGenerationLogs([`Error: ${result.error || "Unknown error"}`]);
@@ -472,8 +435,8 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
    const currentStepIndex = activeStep;
 
    // If there's no active step yet
-    if (currentStepIndex === 0 && completedSteps.length === 0) {
-      return `Starting (0/${totalSteps})`;
+    if (currentStepIndex <= 1 && completedSteps.length === 0) {
+      return `Starting (1/${totalSteps})`;
    }
 
    // If all steps are completed
frontend/src/components/EvaluationDisplay.jsx CHANGED
@@ -272,8 +272,8 @@ const EvaluationDisplay = ({ sessionId }) => {
                     alignItems: "center",
                   }}
                 >
-                  {model.model_name.length > 20
-                    ? `${model.model_name.substring(0, 20)}...`
+                  {model.model_name.length > 40
+                    ? `${model.model_name.substring(0, 40)}...`
                     : model.model_name}
                   <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
                 </Link>