cleanup generation logs
- backend/clean_and_restart_eval.py +2 -14
- backend/examine_judge.py +0 -115
- backend/examine_parquet.py +0 -50
- backend/examine_results.py +0 -70
- backend/examine_strict_results.py +0 -71
- backend/main.py +1 -1
- backend/routes/__init__.py +2 -2
- backend/routes/benchmark.py +129 -59
- backend/tasks/create_bench.py +61 -3
- frontend/src/components/BenchmarkCreateForm.jsx +1 -1
- frontend/src/components/BenchmarkGenerator.jsx +55 -92
- frontend/src/components/EvaluationDisplay.jsx +2 -2
backend/clean_and_restart_eval.py
CHANGED
@@ -93,19 +93,7 @@ async def main(session_id, dataset_name, threshold=None):
 
 
 if __name__ == "__main__":
-
-    parser.add_argument("session_id", help="ID de la session à nettoyer et réévaluer")
-    parser.add_argument("--dataset", "-d", dest="dataset_name",
-                        help="Nom du dataset à évaluer (par défaut: basé sur l'ID de session)")
-    parser.add_argument("--threshold", "-t", dest="threshold", type=int, default=None,
-                        help="Seuil pour l'analyse des sentiments (différence entre mots positifs et négatifs)")
-
-    args = parser.parse_args()
-
-    # Si le nom du dataset n'est pas fourni, le construire à partir de l'ID de session
-    if not args.dataset_name:
-        args.dataset_name = f"yourbench/yourbench_{args.session_id}"
-
+
     # Exécuter la fonction principale de manière asynchrone
-    exit_code = asyncio.run(main(
+    exit_code = asyncio.run(main("pokemon-guide", "yourbench/yourbench_hurricane-faq", -1))
     sys.exit(exit_code)
backend/examine_judge.py
DELETED
@@ -1,115 +0,0 @@
-import re
-import os
-from pprint import pprint
-
-# Chemin vers le fichier de log du juge
-log_file = "lighteval_judge.log"
-
-# Fonction pour extraire les évaluations du juge
-def extract_judge_evaluations(log_content):
-    # Pattern pour trouver les réponses du juge
-    pattern = r"Judge response: (.*?)(?=Judge response:|$)"
-
-    # Extraire toutes les réponses
-    responses = re.findall(pattern, log_content, re.DOTALL)
-
-    # Analyser chaque réponse pour extraire la décision finale
-    evaluations = []
-    for i, response in enumerate(responses):
-        # Chercher la décision finale dans les balises XML
-        final_answer_match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
-
-        if final_answer_match:
-            final_answer = final_answer_match.group(1).strip()
-            evaluations.append({
-                "id": i+1,
-                "final_answer": final_answer,
-                "full_response": response[:500] + "..." if len(response) > 500 else response
-            })
-        else:
-            # Si pas de balise XML, chercher des mots-clés
-            if re.search(r"\b(correct|vrai|true|yes|1)\b", response, re.IGNORECASE):
-                final_answer = "1 (déduit sans balise XML)"
-            elif re.search(r"\b(incorrect|faux|false|no|0)\b", response, re.IGNORECASE):
-                final_answer = "0 (déduit sans balise XML)"
-            else:
-                final_answer = "Non détecté"
-
-            evaluations.append({
-                "id": i+1,
-                "final_answer": final_answer,
-                "full_response": response[:500] + "..." if len(response) > 500 else response
-            })
-
-    return evaluations
-
-# Fonction pour extraire les requêtes envoyées au juge
-def extract_judge_prompts(log_content):
-    # Pattern pour trouver les requêtes
-    pattern = r"Prompt sent to judge: (.*?)(?=Prompt sent to judge:|Judge response:|$)"
-
-    # Extraire toutes les requêtes
-    prompts = re.findall(pattern, log_content, re.DOTALL)
-
-    # Analyser chaque requête
-    analyzed_prompts = []
-    for i, prompt in enumerate(prompts):
-        # Extraire les questions, réponses et réponses de référence
-        question_match = re.search(r"<question>(.*?)</question>", prompt, re.DOTALL)
-        model_answer_match = re.search(r"<model_answer>(.*?)</model_answer>", prompt, re.DOTALL)
-        gold_answer_match = re.search(r"<gold_answer>(.*?)</gold_answer>", prompt, re.DOTALL)
-
-        question = question_match.group(1).strip() if question_match else "Non détecté"
-        model_answer = model_answer_match.group(1).strip() if model_answer_match else "Non détecté"
-        gold_answer = gold_answer_match.group(1).strip() if gold_answer_match else "Non détecté"
-
-        analyzed_prompts.append({
-            "id": i+1,
-            "question": question,
-            "model_answer": model_answer[:200] + "..." if len(model_answer) > 200 else model_answer,
-            "gold_answer": gold_answer[:200] + "..." if len(gold_answer) > 200 else gold_answer
-        })
-
-    return analyzed_prompts
-
-# Lire le fichier de log
-if os.path.exists(log_file):
-    with open(log_file, 'r', encoding='utf-8') as f:
-        log_content = f.read()
-
-    # Extraire les évaluations
-    evaluations = extract_judge_evaluations(log_content)
-
-    # Extraire les prompts
-    prompts = extract_judge_prompts(log_content)
-
-    # Afficher le résumé des évaluations
-    print(f"Nombre total d'évaluations: {len(evaluations)}")
-    print("\nRésumé des décisions:")
-    decisions = {}
-    for eval in evaluations:
-        decision = eval["final_answer"]
-        decisions[decision] = decisions.get(decision, 0) + 1
-
-    for decision, count in decisions.items():
-        print(f"  {decision}: {count} fois ({count/len(evaluations)*100:.1f}%)")
-
-    # Afficher les détails des évaluations
-    print("\n" + "="*80)
-    print("DÉTAIL DES COMPARAISONS QUESTION/RÉPONSE/RÉFÉRENCE/DÉCISION")
-    print("="*80 + "\n")
-
-    for i in range(min(len(prompts), len(evaluations))):
-        prompt = prompts[i]
-        eval = evaluations[i]
-
-        print(f"EXEMPLE {i+1}:")
-        print(f"Question: {prompt['question']}")
-        print(f"\nRéponse du modèle: {prompt['model_answer']}")
-        print(f"\nRéponse de référence: {prompt['gold_answer']}")
-        print(f"\nDécision du juge: {eval['final_answer']}")
-        print(f"\nExtrait de la réponse complète du juge:")
-        print(eval['full_response'][:300] + "..." if len(eval['full_response']) > 300 else eval['full_response'])
-        print("\n" + "-"*80 + "\n")
-else:
-    print(f"Fichier de log {log_file} non trouvé.")
backend/examine_parquet.py
DELETED
@@ -1,50 +0,0 @@
-import pandas as pd
-import sys
-from pprint import pprint
-import numpy as np
-
-# Chemin vers le fichier parquet
-parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-
-# Charger le fichier parquet
-df = pd.read_parquet(parquet_file)
-
-# Afficher des informations de base
-print(f"Nombre total d'exemples: {len(df)}")
-print(f"Colonnes disponibles: {', '.join(df.columns)}")
-print(f"Métriques d'accuracy: {df['metrics'].tolist()}")
-print("\n" + "="*80 + "\n")
-
-# Examiner quelques exemples plus en détail
-for i in range(min(3, len(df))):
-    print(f"EXEMPLE {i+1}:")
-    print(f"Question: {df.iloc[i].specifics.get('question', 'N/A')}")
-    print(f"Réponse du modèle: {df.iloc[i].predictions[0]}")
-    print(f"Réponse de référence (choice): {df.iloc[i].choices[0]}")
-    print(f"Gold index: {df.iloc[i].gold_index}")
-
-    # Afficher le document
-    print("\nDocument:")
-    doc = df.iloc[i].specifics.get('document', 'N/A')
-    print(doc[:500] + "..." if len(doc) > 500 else doc)
-
-    # Afficher les chunks
-    print("\nChunks:")
-    chunks = df.iloc[i].specifics.get('chunks', None)
-    if chunks is not None and len(chunks) > 0:
-        for j in range(len(chunks)):
-            chunk_text = chunks[j]
-            if isinstance(chunk_text, str):
-                print(f"  Chunk {j+1}: {chunk_text[:300]}..." if len(chunk_text) > 300 else f"  Chunk {j+1}: {chunk_text}")
-            else:
-                print(f"  Chunk {j+1}: {type(chunk_text)}")
-    else:
-        print("  Aucun chunk disponible")
-
-    # Afficher d'autres métadonnées
-    print("\nMétadonnées:")
-    print(f"  Catégorie de question: {df.iloc[i].specifics.get('question_category', 'N/A')}")
-    print(f"  Difficulté estimée: {df.iloc[i].specifics.get('estimated_difficulty', 'N/A')}")
-    print(f"  Modèle générateur de question: {df.iloc[i].specifics.get('question_generating_model', 'N/A')}")
-
-    print("\n" + "="*80 + "\n")
backend/examine_results.py
DELETED
@@ -1,70 +0,0 @@
-import pandas as pd
-import sys
-import re
-import difflib
-from pprint import pprint
-
-# Chemin vers le fichier parquet
-parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-
-# Fonction pour nettoyer les réponses (enlever balises XML, espaces, etc.)
-def clean_response(response):
-    # Enlever les balises XML
-    response = re.sub(r'<answer>(.*?)</answer>', r'\1', response, flags=re.DOTALL)
-    # Normaliser les espaces
-    response = ' '.join(response.split())
-    return response.lower().strip()
-
-# Charger le fichier parquet
-df = pd.read_parquet(parquet_file)
-
-# Afficher des informations de base
-print(f"Nombre total d'exemples: {len(df)}")
-print(f"Tous les scores: {[metric.get('accuracy', 'N/A') for metric in df['metrics']]}")
-print("\n" + "="*80 + "\n")
-
-# Analyser la similarité entre les réponses du modèle et les réponses de référence
-print("ANALYSE DE SIMILARITÉ ENTRE RÉPONSES MODÈLE ET RÉPONSES DE RÉFÉRENCE\n")
-
-total_correct_content = 0
-
-for i in range(len(df)):
-    # Extraire les réponses
-    model_answer = df.iloc[i].predictions[0] if len(df.iloc[i].predictions) > 0 else "N/A"
-    reference_answer = df.iloc[i].choices[0] if len(df.iloc[i].choices) > 0 else "N/A"
-    question = df.iloc[i].specifics.get('question', 'N/A')
-
-    # Nettoyer les réponses pour comparaison
-    clean_model = clean_response(model_answer)
-    clean_reference = clean_response(reference_answer)
-
-    # Calculer la similarité
-    similarity = difflib.SequenceMatcher(None, clean_model, clean_reference).ratio()
-
-    # Vérifier si les éléments clés de la réponse de référence sont dans la réponse du modèle
-    key_terms = clean_reference.split()
-    important_terms = [term for term in key_terms if len(term) > 4]  # Mots de plus de 4 lettres
-
-    terms_found = sum(1 for term in important_terms if term in clean_model)
-    term_coverage = terms_found / len(important_terms) if important_terms else 0
-
-    # Définir si le contenu de la réponse est correct (utiliser un seuil)
-    is_content_correct = term_coverage > 0.5 or similarity > 0.4
-    if is_content_correct:
-        total_correct_content += 1
-
-    # Afficher les résultats
-    print(f"EXEMPLE {i+1}:")
-    print(f"Question: {question}")
-    print(f"Réponse du modèle (nettoyée): {clean_model[:150]}..." if len(clean_model) > 150 else f"Réponse du modèle (nettoyée): {clean_model}")
-    print(f"Réponse de référence (nettoyée): {clean_reference}")
-    print(f"Ratio de similarité: {similarity:.2f}")
-    print(f"Couverture des termes importants: {term_coverage:.2f} ({terms_found}/{len(important_terms)})")
-    print(f"Contenu de la réponse jugé correct? {'OUI' if is_content_correct else 'NON'}")
-
-    # Quelques informations supplémentaires
-    print(f"Métrique LightEval: {df.iloc[i].metrics.get('accuracy', 'N/A')}")
-    print("-"*80 + "\n")
-
-print(f"RÉSUMÉ: {total_correct_content}/{len(df)} réponses ({total_correct_content/len(df)*100:.1f}%) ont un contenu jugé correct selon notre analyse simple.")
-print(f"Comparé à LightEval: {sum(metric.get('accuracy', 0) for metric in df['metrics'])}/{len(df)} réponses correctes.")
backend/examine_strict_results.py
DELETED
@@ -1,71 +0,0 @@
-import pandas as pd
-import sys
-import re
-from pprint import pprint
-
-# Chemins vers les fichiers parquet
-parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
-parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
-
-try:
-    # Charger les fichiers parquet
-    print("Chargement des données...")
-    df_original = pd.read_parquet(parquet_file_original)
-    df_strict = pd.read_parquet(parquet_file_strict)
-
-    # Afficher des informations de base
-    print(f"Nombre d'exemples originaux: {len(df_original)}")
-    print(f"Nombre d'exemples stricts: {len(df_strict)}")
-    print(f"Scores originaux: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
-    print(f"Scores stricts: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")
-
-    print("\n" + "="*80 + "\n")
-    print("COMPARAISON DES RÉSULTATS")
-    print("="*80 + "\n")
-
-    # Comparer les résultats
-    for i in range(min(len(df_original), len(df_strict))):
-        print(f"EXEMPLE {i+1}:")
-
-        # Question
-        question_orig = df_original.iloc[i].specifics.get('question', 'N/A')
-        question_strict = df_strict.iloc[i].specifics.get('question', 'N/A')
-        print(f"Question: {question_orig}")
-
-        # Évaluation
-        score_orig = df_original.iloc[i].metrics.get('accuracy', 'N/A')
-        score_strict = df_strict.iloc[i].metrics.get('accuracy', 'N/A')
-        print(f"Score original: {score_orig}")
-        print(f"Score strict: {score_strict}")
-
-        # Réponses
-        model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
-        model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"
-
-        # Référence
-        reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
-        reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"
-
-        # Comparaison des réponses - si identiques ou différentes
-        responses_identical = model_answer_orig == model_answer_strict
-        references_identical = reference_orig == reference_strict
-
-        print(f"Réponses du modèle identiques: {'Oui' if responses_identical else 'Non'}")
-        print(f"Références identiques: {'Oui' if references_identical else 'Non'}")
-
-        # Afficher le changement qui a mené à une modification du résultat
-        if score_orig != score_strict:
-            print(f"\nRaison possible du changement de score:")
-            print(f"  Critères d'évaluation plus stricts dans le prompt système")
-            print(f"  Rejet des réponses contenant des nuances (however, but, although, etc.)")
-
-        print("-"*80 + "\n")
-
-except Exception as e:
-    print(f"Erreur: {e}")
-
-    if "df_original" in locals():
-        print("\nColonnes dans df_original:", df_original.columns.tolist())
-
-    if "df_strict" in locals():
-        print("\nColonnes dans df_strict:", df_strict.columns.tolist())
backend/main.py
CHANGED
@@ -2,7 +2,7 @@ from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
-from routes import routers, session_files,
+from routes import routers, session_files, active_tasks, benchmark
 
 # Load environment variables from .env file
 load_dotenv()
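The updated import pulls the shared active_tasks store and the benchmark module into main.py alongside the routers list. As an illustrative sketch only (the registration code is not part of this hunk), this is the typical FastAPI pattern such a re-exported list supports:

from fastapi import FastAPI
from routes import routers, active_tasks  # as re-exported by routes/__init__.py

app = FastAPI()

# Register every router exposed by the routes package
for router in routers:
    app.include_router(router)

# active_tasks is the same dict the benchmark route writes into, so other
# modules importing it can inspect running benchmark tasks if they need to.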
backend/routes/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 # Routes du module
 from .health import router as health_router
 from .upload import router as upload_router, session_files
-from .benchmark import router as benchmark_router,
+from .benchmark import router as benchmark_router, active_tasks
 from .questions import router as questions_router
 from .download import router as download_router
 from .evaluation import router as evaluation_router, active_evaluation_tasks
@@ -20,4 +20,4 @@ routers = [
 benchmark_router.session_files = session_files
 
 # Exposer les variables partagées pour main.py
-__all__ = ['routers', 'session_files', '
+__all__ = ['routers', 'session_files', 'active_tasks', 'active_evaluation_tasks']
backend/routes/benchmark.py
CHANGED
@@ -8,8 +8,7 @@ from tasks.create_bench import CreateBenchTask
 router = APIRouter(tags=["benchmark"])
 
 # Store active tasks by session_id (importé dans main.py)
-
-active_config_tasks = {}
+active_tasks = {}
 
 # Référence aux session_files (sera fournie par main.py)
 # Cette déclaration sera écrasée par l'affectation dans __init__.py
@@ -24,7 +23,7 @@ async def generate_benchmark(data: Dict[str, Any]):
         data: Dictionary containing session_id
 
     Returns:
-        Dictionary with logs and
+        Dictionary with logs and status
     """
     session_id = data.get("session_id")
 
@@ -39,23 +38,20 @@ async def generate_benchmark(data: Dict[str, Any]):
     all_logs = []
 
     try:
-        #
-
-        # Store the config task for later log retrieval
-        active_config_tasks[session_id] = config_task
+        # Initialiser la tâche qui gérera tout le processus
+        task = UnifiedBenchmarkTask(session_uid=session_id)
 
-        #
-
+        # Stockage pour récupération ultérieure des logs
+        active_tasks[session_id] = task
 
-        #
-
+        # Démarrer le processus de benchmark
+        task.run(file_path)
 
-        #
-
+        # Récupérer les logs initiaux
+        all_logs = task.get_logs()
 
         return {
             "status": "running",
-            "config_path": config_path,
             "logs": all_logs
         }
     except Exception as e:
@@ -65,10 +61,10 @@ async def generate_benchmark(data: Dict[str, Any]):
             "logs": all_logs
         }
 
-@router.get("/
-async def
+@router.get("/benchmark-progress/{session_id}")
+async def get_benchmark_progress(session_id: str):
     """
-    Get the logs for a running
+    Get the logs and status for a running benchmark task
 
     Args:
         session_id: Session ID for the task
@@ -76,57 +72,131 @@ async def get_config_logs(session_id: str):
     Returns:
         Dictionary with logs and completion status
     """
-    if session_id not in
-        raise HTTPException(status_code=404, detail="
+    if session_id not in active_tasks:
+        raise HTTPException(status_code=404, detail="Benchmark task not found")
 
-    logs =
-    is_completed =
-
-    # Si la configuration est terminée et que le benchmark n'est pas encore démarré,
-    # démarrer automatiquement le benchmark
-    if is_completed and session_id not in active_bench_tasks:
-        try:
-            # Ensure the config_path is a string
-            config_path_str = f"uploaded_files/{session_id}/config.yml"
-            bench_task = CreateBenchTask(session_uid=session_id, config_path=config_path_str)
-
-            # Store the bench task for later log retrieval
-            active_bench_tasks[session_id] = bench_task
-
-            # Add a transition log
-            logs.append("[INFO] Configuration file generated, starting benchmark creation")
-
-            # Run the task
-            bench_task.run()
-        except Exception as bench_error:
-            error_msg = f"Error starting benchmark creation: {str(bench_error)}"
-            logs.append(f"[ERROR] {error_msg}")
+    task = active_tasks[session_id]
+    logs = task.get_logs()
+    is_completed = task.is_task_completed()
 
     return {
         "logs": logs,
        "is_completed": is_completed
     }
 
-    """
-    Get the logs for a running benchmark task
-
-    "
+# Créer une classe qui unifie le processus de benchmark
+class UnifiedBenchmarkTask:
+    """
+    Task that handles the entire benchmark process from configuration to completion
+    """
+
+    def __init__(self, session_uid: str):
+        """
+        Initialize the unified benchmark task
+
+        Args:
+            session_uid: Session ID for this task
+        """
+        self.session_uid = session_uid
+        self.logs = []
+        self.is_completed = False
+        self.config_task = None
+        self.bench_task = None
+
+        self._add_log("[INFO] Initializing benchmark task")
+
+    def _add_log(self, message: str):
+        """
+        Add a log message
+
+        Args:
+            message: Log message to add
+        """
+        if message not in self.logs:  # Éviter les doublons
+            self.logs.append(message)
+            # Forcer une copie pour éviter les problèmes de référence
+            self.logs = self.logs.copy()
+            print(f"[{self.session_uid}] {message}")
+
+    def get_logs(self):
+        """
+        Get all logs
+
+        Returns:
+            List of log messages
+        """
+        return self.logs.copy()
+
+    def is_task_completed(self):
+        """
+        Check if the task is completed
+
+        Returns:
+            True if completed, False otherwise
+        """
+        return self.is_completed
+
+    def run(self, file_path: str):
+        """
+        Run the benchmark process
+
+        Args:
+            file_path: Path to the uploaded file
+        """
+        # Démarrer dans un thread séparé pour ne pas bloquer
+        import threading
+        thread = threading.Thread(target=self._run_process, args=(file_path,))
+        thread.daemon = True
+        thread.start()
+
+    def _run_process(self, file_path: str):
+        """
+        Internal method to run the process
+
+        Args:
+            file_path: Path to the uploaded file
+        """
+        try:
+            # Étape 1: Configuration
+            self._add_log("[INFO] Starting configuration process")
+            self.config_task = CreateBenchConfigTask(session_uid=self.session_uid)
+
+            # Exécuter la tâche de configuration
+            config_path = self.config_task.run(file_path=file_path)
+
+            # Récupérer les logs de configuration
+            config_logs = self.config_task.get_logs()
+            for log in config_logs:
+                self._add_log(log)
+
+            # Marquer l'étape de configuration comme terminée
+            if "[SUCCESS] Stage completed: config_generation" not in self.logs:
+                self._add_log("[SUCCESS] Stage completed: configuration")
+
+            # Étape 2: Benchmark
+            self._add_log("[INFO] Starting benchmark process")
+            self.bench_task = CreateBenchTask(session_uid=self.session_uid, config_path=config_path)
+
+            # Exécuter la tâche de benchmark
+            self.bench_task.run()
+
+            # Attendre que la tâche de benchmark soit terminée
+            while not self.bench_task.is_task_completed():
+                # Récupérer les nouveaux logs et les ajouter
+                bench_logs = self.bench_task.get_logs()
+                for log in bench_logs:
+                    self._add_log(log)
+                time.sleep(1)
+
+            # Récupérer les logs finaux
+            final_logs = self.bench_task.get_logs()
+            for log in final_logs:
+                self._add_log(log)
+
+            # Marquer comme terminé
+            self.is_completed = True
+            self._add_log("[SUCCESS] Benchmark process completed successfully")
+
+        except Exception as e:
+            self._add_log(f"[ERROR] Benchmark process failed: {str(e)}")
+            self.is_completed = True
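Together, generate_benchmark and the new /benchmark-progress/{session_id} route give clients a start-then-poll workflow: trigger the benchmark once, then poll until is_completed flips to true. A minimal polling sketch using the requests library; the base URL and session id are placeholders, while the response keys logs and is_completed come from the route above:

import time
import requests

BASE_URL = "http://localhost:8000"   # hypothetical; depends on where the API is served
SESSION_ID = "my-session"            # hypothetical id returned by the upload/generate flow

seen = 0
while True:
    # The session must already have been started via the benchmark generation endpoint,
    # otherwise this route returns 404 ("Benchmark task not found").
    resp = requests.get(f"{BASE_URL}/benchmark-progress/{SESSION_ID}", timeout=10)
    resp.raise_for_status()
    data = resp.json()

    # Print only the log lines we have not displayed yet
    logs = data.get("logs", [])
    for line in logs[seen:]:
        print(line)
    seen = len(logs)

    if data.get("is_completed"):
        break
    time.sleep(2)  # the frontend polls on a 2-second interval as well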
backend/tasks/create_bench.py
CHANGED
@@ -132,7 +132,10 @@ class CreateBenchTask:
                 else:
                     # Detect completed stages
                     if "Completed stage:" in line:
-
+                        # Extraire le nom de l'étape
+                        stage = line.split("'")[1] if "'" in line else line.split("Completed stage:")[1].strip()
+                        # Standardiser les noms d'étapes pour correspondre au frontend
+                        stage = self._standardize_stage_name(stage)
                         self._add_log(f"[SUCCESS] Stage completed: {stage}")
                     else:
                         self._add_log(f"[INFO] {line}")
@@ -141,9 +144,9 @@ class CreateBenchTask:
             if self.process:
                 exit_code = self.process.poll()
                 if exit_code == 0:
-                    self._add_log("[SUCCESS]
+                    self._add_log("[SUCCESS] Benchmark process completed successfully")
                 else:
-                    self._add_log(f"[ERROR]
+                    self._add_log(f"[ERROR] Benchmark process terminated with error code: {exit_code}")
         except Exception as e:
             self._add_log(f"[ERROR] Error during output capture: {str(e)}")
         finally:
@@ -151,6 +154,61 @@ class CreateBenchTask:
             self.is_running_flag.clear()
             self._add_log("[INFO] Output capture completed")
 
+    def _standardize_stage_name(self, stage_name: str) -> str:
+        """
+        Standardize the stage name to match the frontend expectations
+
+        Args:
+            stage_name: Original stage name
+
+        Returns:
+            Standardized stage name
+        """
+        # Table de correspondance pour les noms d'étapes
+        stage_mapping = {
+            # Ajouter ici les correspondances nécessaires
+            # exemple: "original_name": "standardized_name"
+            "ingest": "ingestion",
+            "upload": "upload_ingest_to_hub",
+            "summarize": "summarization",
+            "chunk": "chunking",
+            "generate_questions": "single_shot_question_generation",
+        }
+
+        # Chercher des correspondances partielles
+        for key, value in stage_mapping.items():
+            if key in stage_name.lower():
+                return value
+
+        # Si aucune correspondance n'est trouvée, renvoyer le nom d'origine
+        return stage_name
+
+    def _simulate_ingestion_process(self) -> None:
+        """
+        Simulate the ingestion process for development mode
+        """
+        self._add_log("[INFO] Simulating ingestion process")
+
+        # Simuler les étapes avec les mêmes noms que ceux attendus par le frontend
+        steps = [
+            ("ingestion", 2),
+            ("upload_ingest_to_hub", 3),
+            ("summarization", 2),
+            ("chunking", 3),
+            ("single_shot_question_generation", 4)
+        ]
+
+        for step, delay in steps:
+            # Ajouter un message de début d'étape
+            self._add_log(f"[INFO] Processing {step}...")
+            time.sleep(delay)  # Simuler un délai
+            # Marquer l'étape comme terminée
+            self._add_log(f"[SUCCESS] Stage completed: {step}")
+
+        # Marquer la tâche comme terminée
+        self.is_completed = True
+        self._add_log("[SUCCESS] Benchmark process completed successfully")
+
     def run(self, token: Optional[str] = None) -> None:
         """
         Run the ingestion task
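_standardize_stage_name does a partial, case-insensitive lookup, so raw stage names found in the "Completed stage:" log lines collapse onto the step identifiers the frontend tracks, and anything unknown passes through unchanged. A standalone sketch of that lookup behaviour, using the same mapping as above but shown outside the class purely for illustration:

STAGE_MAPPING = {
    "ingest": "ingestion",
    "upload": "upload_ingest_to_hub",
    "summarize": "summarization",
    "chunk": "chunking",
    "generate_questions": "single_shot_question_generation",
}

def standardize_stage_name(stage_name: str) -> str:
    # Return the first mapped value whose key appears inside the raw stage name
    for key, value in STAGE_MAPPING.items():
        if key in stage_name.lower():
            return value
    # Fall back to the original name when nothing matches
    return stage_name

print(standardize_stage_name("ingest"))               # -> ingestion
print(standardize_stage_name("summarize_documents"))  # -> summarization (hypothetical raw name)
print(standardize_stage_name("chunk_documents"))      # -> chunking (hypothetical raw name)

Note that the match is first-key-wins over the dict order, which is why "ingest" is checked before "upload" in the mapping above.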
frontend/src/components/BenchmarkCreateForm.jsx
CHANGED
@@ -188,7 +188,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
             align="center"
             sx={{ mb: 2, color: "text.secondary" }}
           >
-
+            To create a benchmark, choose a sample document
           </Typography>
 
           <Grid container spacing={2} sx={{ mb: 2 }}>
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -11,30 +11,30 @@ const SIMULATION_DURATION = 20000; // 20 secondes
 
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
+  "configuration",
   "ingestion",
   "upload_ingest_to_hub",
   "summarization",
   "chunking",
   "single_shot_question_generation",
-  "multi_hop_question_generation",
-  "lighteval",
 ];
 
 // Step labels for display (more user-friendly names)
 const STEP_LABELS = {
+  configuration: "Configuration",
   ingestion: "Ingestion",
   upload_ingest_to_hub: "Upload to Hub",
   summarization: "Summarization",
   chunking: "Chunking",
   single_shot_question_generation: "Question generation",
-  multi_hop_question_generation: "Question generation",
-  lighteval: "Saving results",
 };
 
 // Simulated log messages for pre-calculated documents
 const SIMULATED_LOGS = [
   "[INFO] Initializing benchmark generation...",
   "[INFO] Generating base configuration file...",
+  "[SUCCESS] Stage completed: configuration",
+  "[INFO] Starting ingestion process...",
   "[SUCCESS] Stage completed: ingestion",
   "[INFO] Processing document content for upload...",
   "[SUCCESS] Stage completed: upload_ingest_to_hub",
@@ -44,11 +44,7 @@ const SIMULATED_LOGS = [
   "[SUCCESS] Stage completed: chunking",
   "[INFO] Generating single-shot questions...",
   "[SUCCESS] Stage completed: single_shot_question_generation",
-  "[
-  "[SUCCESS] Stage completed: multi_hop_question_generation",
-  "[INFO] Running LightEval for benchmark validation...",
-  "[SUCCESS] Stage completed: lighteval",
-  "[SUCCESS] Ingestion process completed successfully",
+  "[SUCCESS] Benchmark process completed successfully",
 ];
 
 /**
@@ -70,7 +66,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
   const [error, setError] = useState(null);
   const [currentPhase, setCurrentPhase] = useState("initializing");
   const [completedSteps, setCompletedSteps] = useState([]);
-  const [activeStep, setActiveStep] = useState(
+  const [activeStep, setActiveStep] = useState(1);
   const [elapsedTime, setElapsedTime] = useState(0);
 
   // Reference to keep track of the polling interval
@@ -187,7 +183,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
     setError(null);
     setCurrentPhase("initializing");
    setCompletedSteps([]);
-    setActiveStep(
+    setActiveStep(1);
 
     // Timing variables for simulation
     const totalSteps = SIMULATED_LOGS.length;
@@ -248,24 +244,37 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
         });
 
         // Déterminer l'étape active basée sur les étapes complétées
-        let newActiveStep =
+        let newActiveStep = activeStep;
+
         if (newCompletedSteps.length > 0) {
           // Trouver l'étape la plus avancée dans les logs
           const maxCompletedStepIndex = Math.max(
             ...newCompletedSteps.map((step) => BENCHMARK_STEPS.indexOf(step))
           );
-
+          // Passer à l'étape suivante
+          const calculatedStep = maxCompletedStepIndex + 1;
+
+          // Ne mettre à jour que si la nouvelle étape est plus avancée que l'étape actuelle
+          if (calculatedStep > activeStep) {
+            newActiveStep = calculatedStep;
+          }
 
           // S'assurer que l'activeStep ne dépasse pas le nombre total d'étapes
           if (newActiveStep >= BENCHMARK_STEPS.length) {
            newActiveStep = BENCHMARK_STEPS.length;
          }
+        } else if (activeStep === 0) {
+          // Si aucune étape n'est trouvée et l'étape active est 0, passer à 1
+          newActiveStep = 1;
         }
 
         // Mettre à jour l'état si les étapes ont changé
-        // Comparer les tableaux avec JSON.stringify pour une comparaison profonde
         if (JSON.stringify(newCompletedSteps) !== JSON.stringify(completedSteps)) {
           setCompletedSteps(newCompletedSteps);
+        }
+
+        // Mettre à jour l'étape active seulement si elle a changé
+        if (newActiveStep !== activeStep) {
           setActiveStep(newActiveStep);
         }
 
@@ -278,14 +287,12 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
         // Detect completion conditions
         const isComplete =
           recentLogs.some((log) =>
-            log.includes("[SUCCESS]
+            log.includes("[SUCCESS] Benchmark process completed successfully")
           ) ||
           recentLogs.some((log) =>
-            log.includes(
-              "[SUCCESS] Configuration and ingestion completed successfully"
-            )
+            log.includes("[SUCCESS] Ingestion process completed successfully")
           ) ||
-          newCompletedSteps.includes("
+          newCompletedSteps.includes("single_shot_question_generation") ||
           newActiveStep >= BENCHMARK_STEPS.length;
 
         if (isComplete) {
@@ -305,7 +312,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
           });
         }
       } else if (
-        recentLogs.some((log) => log.includes("
+        recentLogs.some((log) => log.includes("Starting ingestion process"))
       ) {
         setCurrentPhase("benchmarking");
       } else if (
@@ -333,7 +340,7 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
     setError(null);
     setCurrentPhase("initializing");
     setCompletedSteps([]);
-    setActiveStep(
+    setActiveStep(1);
 
     try {
       // Call the API to generate the benchmark
@@ -355,87 +362,43 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
       if (response.ok) {
         setGenerationLogs(result.logs || []);
 
-        //
-
+        // Configurer le polling pour suivre la progression
+        pollingIntervalRef.current = setInterval(async () => {
+          // Vérifier si on a déjà terminé
+          if (generationComplete) {
+            clearInterval(pollingIntervalRef.current);
+            return;
+          }
+
           try {
-          //
-          const
-            `${API_CONFIG.BASE_URL}/
+            // Appeler l'API pour obtenir les derniers logs
+            const logsResponse = await fetch(
+              `${API_CONFIG.BASE_URL}/benchmark-progress/${sessionId}`
             );
 
-          if (
-            const
+            if (logsResponse.ok) {
+              const logsResult = await logsResponse.json();
 
-            //
+              // Mettre à jour les logs s'il y en a de nouveaux
               if (
-
-
+                logsResult.logs &&
+                logsResult.logs.length > generationLogs.length
               ) {
-              setGenerationLogs(
+                setGenerationLogs(logsResult.logs);
               }
 
-            //
-            if (
-
-
-
-                "Configuration completed, switching to benchmark polling"
-              );
-              clearInterval(configPollingIntervalRef.current);
-              pollBenchmarkLogs();
-            }, 1000);
+              // Vérifier si la tâche est terminée
+              if (logsResult.is_completed) {
+                setGenerationComplete(true);
+                clearInterval(pollingIntervalRef.current);
+                // La notification est maintenant gérée dans le useEffect ci-dessus
               }
             }
           } catch (error) {
-        console.log("Error polling for
-        //
+            console.log("Error polling for logs:", error);
+            // Ne pas arrêter le polling en cas d'erreurs réseau
           }
-    };
-
-    // Fonction pour interroger les logs du benchmark
-    const pollBenchmarkLogs = async () => {
-      // Set up polling for benchmark logs
-      pollingIntervalRef.current = setInterval(async () => {
-        // Check if we already completed
-        if (generationComplete) {
-          clearInterval(pollingIntervalRef.current);
-          return;
-        }
-
-        try {
-          // Call the API to get the latest benchmark logs
-          const logsResponse = await fetch(
-            `${API_CONFIG.BASE_URL}/benchmark-logs/${sessionId}`
-          );
-
-          if (logsResponse.ok) {
-            const logsResult = await logsResponse.json();
-
-            // Update logs if there are new ones
-            if (
-              logsResult.logs &&
-              logsResult.logs.length > generationLogs.length
-            ) {
-              setGenerationLogs(logsResult.logs);
-            }
-
-            // Check if the task is completed
-            if (logsResult.is_completed) {
-              setGenerationComplete(true);
-              clearInterval(pollingIntervalRef.current);
-              // Notification is now handled in the useEffect above
-            }
-          }
-        } catch (error) {
-          console.log("Error polling for benchmark logs:", error);
-          // Don't stop polling on network errors
-        }
-      }, 3000); // Poll every 3 seconds
-    };
-
-    // Démarrer le polling des logs de configuration
-    const configPollingIntervalRef = { current: null };
-    configPollingIntervalRef.current = setInterval(pollConfigLogs, 1000); // Poll config logs more frequently (every second)
+        }, 2000); // Interroger toutes les 2 secondes
       } else {
         // Handle error
         setGenerationLogs([`Error: ${result.error || "Unknown error"}`]);
@@ -472,8 +435,8 @@ const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
     const currentStepIndex = activeStep;
 
     // If there's no active step yet
-    if (currentStepIndex
-      return `Starting (
+    if (currentStepIndex <= 1 && completedSteps.length === 0) {
+      return `Starting (1/${totalSteps})`;
     }
 
     // If all steps are completed
frontend/src/components/EvaluationDisplay.jsx
CHANGED
@@ -272,8 +272,8 @@ const EvaluationDisplay = ({ sessionId }) => {
                   alignItems: "center",
                 }}
               >
-                {model.model_name.length >
-                  ? `${model.model_name.substring(0,
+                {model.model_name.length > 40
+                  ? `${model.model_name.substring(0, 40)}...`
                   : model.model_name}
                 <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
               </Link>