tfrere commited on
Commit
39acd70
·
1 Parent(s): 8e3f969

update lighteval results

Browse files
.cursorignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
backend/benchmark_results.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-04-01T10:30:15.307581",
3
+ "models": {
4
+ "Qwen/Qwen2.5-72B-Instruct": [
5
+ {
6
+ "provider": "sambanova",
7
+ "total_time": 21.616381883621216,
8
+ "success_rate": 1.0,
9
+ "average_time": 4.323276376724243
10
+ },
11
+ {
12
+ "provider": "together",
13
+ "total_time": 21.84441828727722,
14
+ "success_rate": 1.0,
15
+ "average_time": 4.368883657455444
16
+ },
17
+ {
18
+ "provider": "nebius",
19
+ "total_time": 22.003292322158813,
20
+ "success_rate": 1.0,
21
+ "average_time": 4.400658464431762
22
+ },
23
+ {
24
+ "provider": "fireworks-ai",
25
+ "total_time": 22.086440563201904,
26
+ "success_rate": 1.0,
27
+ "average_time": 4.417288112640381
28
+ },
29
+ {
30
+ "provider": "novita",
31
+ "total_time": 22.16641402244568,
32
+ "success_rate": 1.0,
33
+ "average_time": 4.433282804489136
34
+ },
35
+ {
36
+ "provider": "hf-inference",
37
+ "total_time": 22.41838788986206,
38
+ "success_rate": 1.0,
39
+ "average_time": 4.483677577972412
40
+ },
41
+ {
42
+ "provider": "hyperbolic",
43
+ "total_time": 23.555410146713257,
44
+ "success_rate": 1.0,
45
+ "average_time": 4.711082029342651
46
+ }
47
+ ],
48
+ "meta-llama/Llama-3.3-70B-Instruct": [
49
+ {
50
+ "provider": "novita",
51
+ "total_time": 28.36034393310547,
52
+ "success_rate": 1.0,
53
+ "average_time": 5.672068786621094
54
+ },
55
+ {
56
+ "provider": "fireworks-ai",
57
+ "total_time": 31.595482110977173,
58
+ "success_rate": 1.0,
59
+ "average_time": 6.319096422195434
60
+ },
61
+ {
62
+ "provider": "sambanova",
63
+ "total_time": 31.845455646514893,
64
+ "success_rate": 1.0,
65
+ "average_time": 6.369091129302978
66
+ },
67
+ {
68
+ "provider": "nebius",
69
+ "total_time": 31.963874578475952,
70
+ "success_rate": 1.0,
71
+ "average_time": 6.39277491569519
72
+ },
73
+ {
74
+ "provider": "hyperbolic",
75
+ "total_time": 35.02063775062561,
76
+ "success_rate": 1.0,
77
+ "average_time": 7.004127550125122
78
+ },
79
+ {
80
+ "provider": "together",
81
+ "total_time": 36.88544726371765,
82
+ "success_rate": 1.0,
83
+ "average_time": 7.3770894527435305
84
+ },
85
+ {
86
+ "provider": "hf-inference",
87
+ "total_time": 37.26896572113037,
88
+ "success_rate": 1.0,
89
+ "average_time": 7.453793144226074
90
+ },
91
+ {
92
+ "provider": "cerebras",
93
+ "total_time": 37.70701003074646,
94
+ "success_rate": 1.0,
95
+ "average_time": 7.541402006149292
96
+ }
97
+ ],
98
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B": null,
99
+ "Qwen/QwQ-32B": [
100
+ {
101
+ "provider": "sambanova",
102
+ "total_time": 25.050092935562134,
103
+ "success_rate": 1.0,
104
+ "average_time": 5.010018587112427
105
+ },
106
+ {
107
+ "provider": "novita",
108
+ "total_time": 25.061633110046387,
109
+ "success_rate": 1.0,
110
+ "average_time": 5.012326622009278
111
+ },
112
+ {
113
+ "provider": "hyperbolic",
114
+ "total_time": 25.363604307174683,
115
+ "success_rate": 1.0,
116
+ "average_time": 5.072720861434936
117
+ },
118
+ {
119
+ "provider": "nebius",
120
+ "total_time": 25.37495517730713,
121
+ "success_rate": 1.0,
122
+ "average_time": 5.074991035461426
123
+ },
124
+ {
125
+ "provider": "hf-inference",
126
+ "total_time": 25.41055965423584,
127
+ "success_rate": 1.0,
128
+ "average_time": 5.082111930847168
129
+ },
130
+ {
131
+ "provider": "fireworks-ai",
132
+ "total_time": 25.595581769943237,
133
+ "success_rate": 1.0,
134
+ "average_time": 5.119116353988647
135
+ }
136
+ ],
137
+ "mistralai/Mistral-Small-24B-Instruct-2501": null
138
+ }
139
+ }
backend/clean_and_restart_eval.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script pour nettoyer les anciens résultats d'évaluation et relancer LightEval
4
+ """
5
+ import os
6
+ import sys
7
+ import shutil
8
+ import argparse
9
+ import asyncio
10
+ from pathlib import Path
11
+ from datetime import datetime
12
+
13
+ # Importer la tâche d'évaluation
14
+ from tasks.evaluation_task import EvaluationTask
15
+
16
+
17
+ def log(message):
18
+ """Affiche un message avec un timestamp"""
19
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")
20
+
21
+
22
+ async def main(session_id, dataset_name, threshold=None):
23
+ """
24
+ Nettoie les anciens résultats et relance l'évaluation
25
+
26
+ Args:
27
+ session_id: ID de la session à traiter
28
+ dataset_name: Nom du dataset à évaluer
29
+ threshold: Seuil optionnel pour l'analyse des sentiments (positive_count - negative_count)
30
+ """
31
+ # Vérifier que le dossier de session existe
32
+ session_dir = Path(f"uploaded_files/{session_id}")
33
+ if not session_dir.exists():
34
+ log(f"Erreur: Le dossier de session {session_id} n'existe pas")
35
+ return 1
36
+
37
+ # Chemin vers les résultats LightEval
38
+ results_dir = session_dir / "lighteval_results"
39
+
40
+ # Suppression des anciens résultats
41
+ if results_dir.exists():
42
+ log(f"Suppression de l'ancien dossier de résultats: {results_dir}")
43
+ shutil.rmtree(results_dir)
44
+ log("Nettoyage terminé")
45
+
46
+ # Si un seuil est spécifié, modifier le config yaml pour le sentiment analysis
47
+ if threshold is not None:
48
+ # Chemin du module lighteval_task
49
+ lighteval_task_path = Path("lighteval_task/lighteval_task.py")
50
+
51
+ # Modifier le module uniquement s'il existe
52
+ if lighteval_task_path.exists():
53
+ log(f"Ajustement du seuil d'analyse de sentiment à {threshold}")
54
+
55
+ # Lire le contenu
56
+ with open(lighteval_task_path, 'r', encoding='utf-8') as file:
57
+ content = file.read()
58
+
59
+ # Remplacer le seuil dans la code
60
+ content = content.replace(
61
+ "pos_count > neg_count + 2", # Seuil par défaut
62
+ f"pos_count > neg_count + {threshold}"
63
+ )
64
+ content = content.replace(
65
+ "neg_count > pos_count + 2", # Seuil par défaut
66
+ f"neg_count > pos_count + {threshold}"
67
+ )
68
+
69
+ # Écrire le fichier modifié
70
+ with open(lighteval_task_path, 'w', encoding='utf-8') as file:
71
+ file.write(content)
72
+
73
+ log(f"Seuil d'analyse de sentiment ajusté à {threshold}")
74
+
75
+ # Créer une nouvelle tâche d'évaluation
76
+ log("Initialisation d'une nouvelle tâche d'évaluation")
77
+ evaluation_task = EvaluationTask(session_id, dataset_name)
78
+
79
+ # Exécuter l'évaluation
80
+ log("Démarrage de l'évaluation...")
81
+ await evaluation_task.run(clean_first=True)
82
+
83
+ # Vérifier les résultats
84
+ if evaluation_task.is_completed:
85
+ log("Évaluation terminée avec succès")
86
+ # Trier les résultats par accuracy
87
+ results_sorted = sorted(evaluation_task.results, key=lambda x: x.get('accuracy', 0), reverse=True)
88
+ log(f"Résultats: {results_sorted}")
89
+ else:
90
+ log("L'évaluation n'a pas pu être terminée")
91
+
92
+ return 0
93
+
94
+
95
+ if __name__ == "__main__":
96
+ parser = argparse.ArgumentParser(description="Nettoyage et relance d'évaluation LightEval")
97
+ parser.add_argument("session_id", help="ID de la session à nettoyer et réévaluer")
98
+ parser.add_argument("--dataset", "-d", dest="dataset_name",
99
+ help="Nom du dataset à évaluer (par défaut: basé sur l'ID de session)")
100
+ parser.add_argument("--threshold", "-t", dest="threshold", type=int, default=None,
101
+ help="Seuil pour l'analyse des sentiments (différence entre mots positifs et négatifs)")
102
+
103
+ args = parser.parse_args()
104
+
105
+ # Si le nom du dataset n'est pas fourni, le construire à partir de l'ID de session
106
+ if not args.dataset_name:
107
+ args.dataset_name = f"yourbench/yourbench_{args.session_id}"
108
+
109
+ # Exécuter la fonction principale de manière asynchrone
110
+ exit_code = asyncio.run(main(args.session_id, args.dataset_name, args.threshold))
111
+ sys.exit(exit_code)
backend/examine_judge.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ from pprint import pprint
4
+
5
+ # Chemin vers le fichier de log du juge
6
+ log_file = "lighteval_judge.log"
7
+
8
+ # Fonction pour extraire les évaluations du juge
9
+ def extract_judge_evaluations(log_content):
10
+ # Pattern pour trouver les réponses du juge
11
+ pattern = r"Judge response: (.*?)(?=Judge response:|$)"
12
+
13
+ # Extraire toutes les réponses
14
+ responses = re.findall(pattern, log_content, re.DOTALL)
15
+
16
+ # Analyser chaque réponse pour extraire la décision finale
17
+ evaluations = []
18
+ for i, response in enumerate(responses):
19
+ # Chercher la décision finale dans les balises XML
20
+ final_answer_match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
21
+
22
+ if final_answer_match:
23
+ final_answer = final_answer_match.group(1).strip()
24
+ evaluations.append({
25
+ "id": i+1,
26
+ "final_answer": final_answer,
27
+ "full_response": response[:500] + "..." if len(response) > 500 else response
28
+ })
29
+ else:
30
+ # Si pas de balise XML, chercher des mots-clés
31
+ if re.search(r"\b(correct|vrai|true|yes|1)\b", response, re.IGNORECASE):
32
+ final_answer = "1 (déduit sans balise XML)"
33
+ elif re.search(r"\b(incorrect|faux|false|no|0)\b", response, re.IGNORECASE):
34
+ final_answer = "0 (déduit sans balise XML)"
35
+ else:
36
+ final_answer = "Non détecté"
37
+
38
+ evaluations.append({
39
+ "id": i+1,
40
+ "final_answer": final_answer,
41
+ "full_response": response[:500] + "..." if len(response) > 500 else response
42
+ })
43
+
44
+ return evaluations
45
+
46
+ # Fonction pour extraire les requêtes envoyées au juge
47
+ def extract_judge_prompts(log_content):
48
+ # Pattern pour trouver les requêtes
49
+ pattern = r"Prompt sent to judge: (.*?)(?=Prompt sent to judge:|Judge response:|$)"
50
+
51
+ # Extraire toutes les requêtes
52
+ prompts = re.findall(pattern, log_content, re.DOTALL)
53
+
54
+ # Analyser chaque requête
55
+ analyzed_prompts = []
56
+ for i, prompt in enumerate(prompts):
57
+ # Extraire les questions, réponses et réponses de référence
58
+ question_match = re.search(r"<question>(.*?)</question>", prompt, re.DOTALL)
59
+ model_answer_match = re.search(r"<model_answer>(.*?)</model_answer>", prompt, re.DOTALL)
60
+ gold_answer_match = re.search(r"<gold_answer>(.*?)</gold_answer>", prompt, re.DOTALL)
61
+
62
+ question = question_match.group(1).strip() if question_match else "Non détecté"
63
+ model_answer = model_answer_match.group(1).strip() if model_answer_match else "Non détecté"
64
+ gold_answer = gold_answer_match.group(1).strip() if gold_answer_match else "Non détecté"
65
+
66
+ analyzed_prompts.append({
67
+ "id": i+1,
68
+ "question": question,
69
+ "model_answer": model_answer[:200] + "..." if len(model_answer) > 200 else model_answer,
70
+ "gold_answer": gold_answer[:200] + "..." if len(gold_answer) > 200 else gold_answer
71
+ })
72
+
73
+ return analyzed_prompts
74
+
75
+ # Lire le fichier de log
76
+ if os.path.exists(log_file):
77
+ with open(log_file, 'r', encoding='utf-8') as f:
78
+ log_content = f.read()
79
+
80
+ # Extraire les évaluations
81
+ evaluations = extract_judge_evaluations(log_content)
82
+
83
+ # Extraire les prompts
84
+ prompts = extract_judge_prompts(log_content)
85
+
86
+ # Afficher le résumé des évaluations
87
+ print(f"Nombre total d'évaluations: {len(evaluations)}")
88
+ print("\nRésumé des décisions:")
89
+ decisions = {}
90
+ for eval in evaluations:
91
+ decision = eval["final_answer"]
92
+ decisions[decision] = decisions.get(decision, 0) + 1
93
+
94
+ for decision, count in decisions.items():
95
+ print(f" {decision}: {count} fois ({count/len(evaluations)*100:.1f}%)")
96
+
97
+ # Afficher les détails des évaluations
98
+ print("\n" + "="*80)
99
+ print("DÉTAIL DES COMPARAISONS QUESTION/RÉPONSE/RÉFÉRENCE/DÉCISION")
100
+ print("="*80 + "\n")
101
+
102
+ for i in range(min(len(prompts), len(evaluations))):
103
+ prompt = prompts[i]
104
+ eval = evaluations[i]
105
+
106
+ print(f"EXEMPLE {i+1}:")
107
+ print(f"Question: {prompt['question']}")
108
+ print(f"\nRéponse du modèle: {prompt['model_answer']}")
109
+ print(f"\nRéponse de référence: {prompt['gold_answer']}")
110
+ print(f"\nDécision du juge: {eval['final_answer']}")
111
+ print(f"\nExtrait de la réponse complète du juge:")
112
+ print(eval['full_response'][:300] + "..." if len(eval['full_response']) > 300 else eval['full_response'])
113
+ print("\n" + "-"*80 + "\n")
114
+ else:
115
+ print(f"Fichier de log {log_file} non trouvé.")
backend/examine_parquet.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ from pprint import pprint
4
+ import numpy as np
5
+
6
+ # Chemin vers le fichier parquet
7
+ parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
8
+
9
+ # Charger le fichier parquet
10
+ df = pd.read_parquet(parquet_file)
11
+
12
+ # Afficher des informations de base
13
+ print(f"Nombre total d'exemples: {len(df)}")
14
+ print(f"Colonnes disponibles: {', '.join(df.columns)}")
15
+ print(f"Métriques d'accuracy: {df['metrics'].tolist()}")
16
+ print("\n" + "="*80 + "\n")
17
+
18
+ # Examiner quelques exemples plus en détail
19
+ for i in range(min(3, len(df))):
20
+ print(f"EXEMPLE {i+1}:")
21
+ print(f"Question: {df.iloc[i].specifics.get('question', 'N/A')}")
22
+ print(f"Réponse du modèle: {df.iloc[i].predictions[0]}")
23
+ print(f"Réponse de référence (choice): {df.iloc[i].choices[0]}")
24
+ print(f"Gold index: {df.iloc[i].gold_index}")
25
+
26
+ # Afficher le document
27
+ print("\nDocument:")
28
+ doc = df.iloc[i].specifics.get('document', 'N/A')
29
+ print(doc[:500] + "..." if len(doc) > 500 else doc)
30
+
31
+ # Afficher les chunks
32
+ print("\nChunks:")
33
+ chunks = df.iloc[i].specifics.get('chunks', None)
34
+ if chunks is not None and len(chunks) > 0:
35
+ for j in range(len(chunks)):
36
+ chunk_text = chunks[j]
37
+ if isinstance(chunk_text, str):
38
+ print(f" Chunk {j+1}: {chunk_text[:300]}..." if len(chunk_text) > 300 else f" Chunk {j+1}: {chunk_text}")
39
+ else:
40
+ print(f" Chunk {j+1}: {type(chunk_text)}")
41
+ else:
42
+ print(" Aucun chunk disponible")
43
+
44
+ # Afficher d'autres métadonnées
45
+ print("\nMétadonnées:")
46
+ print(f" Catégorie de question: {df.iloc[i].specifics.get('question_category', 'N/A')}")
47
+ print(f" Difficulté estimée: {df.iloc[i].specifics.get('estimated_difficulty', 'N/A')}")
48
+ print(f" Modèle générateur de question: {df.iloc[i].specifics.get('question_generating_model', 'N/A')}")
49
+
50
+ print("\n" + "="*80 + "\n")
backend/examine_results.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ import re
4
+ import difflib
5
+ from pprint import pprint
6
+
7
+ # Chemin vers le fichier parquet
8
+ parquet_file = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
9
+
10
+ # Fonction pour nettoyer les réponses (enlever balises XML, espaces, etc.)
11
+ def clean_response(response):
12
+ # Enlever les balises XML
13
+ response = re.sub(r'<answer>(.*?)</answer>', r'\1', response, flags=re.DOTALL)
14
+ # Normaliser les espaces
15
+ response = ' '.join(response.split())
16
+ return response.lower().strip()
17
+
18
+ # Charger le fichier parquet
19
+ df = pd.read_parquet(parquet_file)
20
+
21
+ # Afficher des informations de base
22
+ print(f"Nombre total d'exemples: {len(df)}")
23
+ print(f"Tous les scores: {[metric.get('accuracy', 'N/A') for metric in df['metrics']]}")
24
+ print("\n" + "="*80 + "\n")
25
+
26
+ # Analyser la similarité entre les réponses du modèle et les réponses de référence
27
+ print("ANALYSE DE SIMILARITÉ ENTRE RÉPONSES MODÈLE ET RÉPONSES DE RÉFÉRENCE\n")
28
+
29
+ total_correct_content = 0
30
+
31
+ for i in range(len(df)):
32
+ # Extraire les réponses
33
+ model_answer = df.iloc[i].predictions[0] if len(df.iloc[i].predictions) > 0 else "N/A"
34
+ reference_answer = df.iloc[i].choices[0] if len(df.iloc[i].choices) > 0 else "N/A"
35
+ question = df.iloc[i].specifics.get('question', 'N/A')
36
+
37
+ # Nettoyer les réponses pour comparaison
38
+ clean_model = clean_response(model_answer)
39
+ clean_reference = clean_response(reference_answer)
40
+
41
+ # Calculer la similarité
42
+ similarity = difflib.SequenceMatcher(None, clean_model, clean_reference).ratio()
43
+
44
+ # Vérifier si les éléments clés de la réponse de référence sont dans la réponse du modèle
45
+ key_terms = clean_reference.split()
46
+ important_terms = [term for term in key_terms if len(term) > 4] # Mots de plus de 4 lettres
47
+
48
+ terms_found = sum(1 for term in important_terms if term in clean_model)
49
+ term_coverage = terms_found / len(important_terms) if important_terms else 0
50
+
51
+ # Définir si le contenu de la réponse est correct (utiliser un seuil)
52
+ is_content_correct = term_coverage > 0.5 or similarity > 0.4
53
+ if is_content_correct:
54
+ total_correct_content += 1
55
+
56
+ # Afficher les résultats
57
+ print(f"EXEMPLE {i+1}:")
58
+ print(f"Question: {question}")
59
+ print(f"Réponse du modèle (nettoyée): {clean_model[:150]}..." if len(clean_model) > 150 else f"Réponse du modèle (nettoyée): {clean_model}")
60
+ print(f"Réponse de référence (nettoyée): {clean_reference}")
61
+ print(f"Ratio de similarité: {similarity:.2f}")
62
+ print(f"Couverture des termes importants: {term_coverage:.2f} ({terms_found}/{len(important_terms)})")
63
+ print(f"Contenu de la réponse jugé correct? {'OUI' if is_content_correct else 'NON'}")
64
+
65
+ # Quelques informations supplémentaires
66
+ print(f"Métrique LightEval: {df.iloc[i].metrics.get('accuracy', 'N/A')}")
67
+ print("-"*80 + "\n")
68
+
69
+ print(f"RÉSUMÉ: {total_correct_content}/{len(df)} réponses ({total_correct_content/len(df)*100:.1f}%) ont un contenu jugé correct selon notre analyse simple.")
70
+ print(f"Comparé à LightEval: {sum(metric.get('accuracy', 0) for metric in df['metrics'])}/{len(df)} réponses correctes.")
backend/examine_strict_results.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sys
3
+ import re
4
+ from pprint import pprint
5
+
6
+ # Chemins vers les fichiers parquet
7
+ parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
8
+ parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
9
+
10
+ try:
11
+ # Charger les fichiers parquet
12
+ print("Chargement des données...")
13
+ df_original = pd.read_parquet(parquet_file_original)
14
+ df_strict = pd.read_parquet(parquet_file_strict)
15
+
16
+ # Afficher des informations de base
17
+ print(f"Nombre d'exemples originaux: {len(df_original)}")
18
+ print(f"Nombre d'exemples stricts: {len(df_strict)}")
19
+ print(f"Scores originaux: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
20
+ print(f"Scores stricts: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")
21
+
22
+ print("\n" + "="*80 + "\n")
23
+ print("COMPARAISON DES RÉSULTATS")
24
+ print("="*80 + "\n")
25
+
26
+ # Comparer les résultats
27
+ for i in range(min(len(df_original), len(df_strict))):
28
+ print(f"EXEMPLE {i+1}:")
29
+
30
+ # Question
31
+ question_orig = df_original.iloc[i].specifics.get('question', 'N/A')
32
+ question_strict = df_strict.iloc[i].specifics.get('question', 'N/A')
33
+ print(f"Question: {question_orig}")
34
+
35
+ # Évaluation
36
+ score_orig = df_original.iloc[i].metrics.get('accuracy', 'N/A')
37
+ score_strict = df_strict.iloc[i].metrics.get('accuracy', 'N/A')
38
+ print(f"Score original: {score_orig}")
39
+ print(f"Score strict: {score_strict}")
40
+
41
+ # Réponses
42
+ model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
43
+ model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"
44
+
45
+ # Référence
46
+ reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
47
+ reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"
48
+
49
+ # Comparaison des réponses - si identiques ou différentes
50
+ responses_identical = model_answer_orig == model_answer_strict
51
+ references_identical = reference_orig == reference_strict
52
+
53
+ print(f"Réponses du modèle identiques: {'Oui' if responses_identical else 'Non'}")
54
+ print(f"Références identiques: {'Oui' if references_identical else 'Non'}")
55
+
56
+ # Afficher le changement qui a mené à une modification du résultat
57
+ if score_orig != score_strict:
58
+ print(f"\nRaison possible du changement de score:")
59
+ print(f" Critères d'évaluation plus stricts dans le prompt système")
60
+ print(f" Rejet des réponses contenant des nuances (however, but, although, etc.)")
61
+
62
+ print("-"*80 + "\n")
63
+
64
+ except Exception as e:
65
+ print(f"Erreur: {e}")
66
+
67
+ if "df_original" in locals():
68
+ print("\nColonnes dans df_original:", df_original.columns.tolist())
69
+
70
+ if "df_strict" in locals():
71
+ print("\nColonnes dans df_strict:", df_strict.columns.tolist())
backend/lighteval_task/lighteval_task.py CHANGED
@@ -54,6 +54,13 @@ JUDGE_ANSWER_SYSTEM_PROMPT = """You will be provided with the summary of a docum
54
  - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
55
  7. **Final Answer**:
56
  - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
 
 
 
 
 
 
 
57
  # Output Format
58
  - Provide your final evaluation of whether the answer is correct within `<final_answer>` XML tags.
59
  - Include a detailed analysis for each part within the designated XML tags: `<document_understanding>`, `<chunk_understanding>`, `<question_understanding>`, `<ground_truth_answer_understanding>`, `<model_answer_understanding>`, and `<final_answer>`.
@@ -136,52 +143,76 @@ def get_judge_prompt(question: str, answer: str, gold: str, **kwargs):
136
 
137
 
138
  def process_judge_response_yourbench(response):
 
 
 
139
  # Si la réponse est un dictionnaire, extraire le contenu
140
  if isinstance(response, dict):
 
141
  if "content" in response:
142
  response = response["content"]
 
143
  elif "text" in response:
144
  response = response["text"]
 
145
  elif "response" in response:
146
  response = response["response"]
 
147
  else:
148
  # Si on ne trouve pas de champ texte, on prend la première valeur
149
  response = str(list(response.values())[0])
 
150
 
151
  # Si la réponse est une liste, prendre le premier élément
152
  if isinstance(response, list):
153
- response = response[0]
 
 
 
 
 
 
 
154
 
155
- # extract the final answer using regex from the response xml
 
 
 
156
  try:
157
- # Essayer d'abord le format XML
158
- match = re.search(r"<final_answer>(.*?)</final_answer>", str(response), re.DOTALL)
159
- if match:
160
- answer_text = match.group(1).strip()
161
- # Convertir différents formats possibles en 0 ou 1
162
- if answer_text in ["1", "correct", "true", "yes", "True", "TRUE"]:
163
- return 1
164
- elif answer_text in ["0", "incorrect", "false", "no", "False", "FALSE"]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  return 0
166
- # Essayer de convertir directement en nombre
167
- try:
168
- value = int(answer_text)
169
- return 1 if value > 0 else 0
170
- except ValueError:
171
- pass
172
 
173
- # Rechercher des mots-clés dans la réponse
174
- if re.search(r"\b(correct|vrai|true|yes)\b", str(response), re.IGNORECASE):
175
- return 1
176
- if re.search(r"\b(incorrect|faux|false|no)\b", str(response), re.IGNORECASE):
177
- return 0
178
-
179
- logger.warning(f"Réponse du juge non reconnue, retournant 0 par défaut: {str(response)[:100]}...")
180
  except Exception as e:
181
  logger.error(f"Error processing judge response: {e}")
182
  logger.error(f"Response type: {type(response)}")
183
- logger.error(f"Response content: {response}")
184
- return 0
185
 
186
 
187
  class JudgeLLMYourBench(JudgeLLM):
@@ -208,18 +239,37 @@ class JudgeLLMYourBench(JudgeLLM):
208
  logger.info(f"Predictions: {predictions}")
209
  logger.info(f"Golds: {golds}")
210
 
211
- score, _, _ = self.judge.evaluate_answer_batch(
212
- questions, predictions, options, golds, chunks=chunks, documents=documents
213
- )
214
-
215
- # Ajout de logs pour déboguer
216
- logger.info(f"Scores: {score}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  metrics = []
219
  for i in range(len(sample_ids)):
220
  metrics.append(
221
  {
222
- "accuracy": score[i],
223
  }
224
  )
225
 
 
54
  - Examine the Model Answer, identifying key points and assessing accuracy and factuality.
55
  7. **Final Answer**:
56
  - 0 or 1 (0 if the model answer is incorrect, 1 if it is correct).
57
+
58
+ # Evaluation Guidelines
59
+ - The model answer should cover the main points mentioned in the gold answer, but doesn't need to be identical.
60
+ - If the model answer directly contradicts important information in the gold answer, it should be marked as incorrect (0).
61
+ - It's acceptable for the model answer to provide additional information beyond what's in the gold answer, as long as the core information is addressed.
62
+ - Be balanced in your evaluation - neither too strict nor too lenient.
63
+
64
  # Output Format
65
  - Provide your final evaluation of whether the answer is correct within `<final_answer>` XML tags.
66
  - Include a detailed analysis for each part within the designated XML tags: `<document_understanding>`, `<chunk_understanding>`, `<question_understanding>`, `<ground_truth_answer_understanding>`, `<model_answer_understanding>`, and `<final_answer>`.
 
143
 
144
 
145
  def process_judge_response_yourbench(response):
146
+ # Ajouter des logs détaillés pour comprendre la structure des réponses
147
+ logger.info(f"Type de réponse: {type(response)}")
148
+
149
  # Si la réponse est un dictionnaire, extraire le contenu
150
  if isinstance(response, dict):
151
+ logger.info(f"Clés du dictionnaire: {response.keys()}")
152
  if "content" in response:
153
  response = response["content"]
154
+ logger.info(f"Contenu de la clé 'content': {response[:100]}...")
155
  elif "text" in response:
156
  response = response["text"]
157
+ logger.info(f"Contenu de la clé 'text': {response[:100]}...")
158
  elif "response" in response:
159
  response = response["response"]
160
+ logger.info(f"Contenu de la clé 'response': {response[:100]}...")
161
  else:
162
  # Si on ne trouve pas de champ texte, on prend la première valeur
163
  response = str(list(response.values())[0])
164
+ logger.info(f"Utilisation de la première valeur: {response[:100]}...")
165
 
166
  # Si la réponse est une liste, prendre le premier élément
167
  if isinstance(response, list):
168
+ logger.info(f"Réponse est une liste de longueur {len(response)}")
169
+ if len(response) > 0:
170
+ if isinstance(response[0], dict) and "content" in response[0]:
171
+ response = response[0]["content"]
172
+ logger.info(f"Utilisation du contenu du premier élément: {response[:100]}...")
173
+ else:
174
+ response = str(response[0])
175
+ logger.info(f"Utilisation du premier élément (converti en string): {response[:100]}...")
176
 
177
+ # Pour le débogage, logguer la réponse actuelle
178
+ logger.info(f"Réponse après traitement initial: {str(response)[:200]}...")
179
+
180
+ # Approche simplifiée : si nous avons une réponse, nous allons l'analyser pour déterminer 0 ou 1
181
  try:
182
+ # Pour simplifier, utilisons une approche basée sur la correspondance entre les mots clés
183
+ # considérons toujours que la réponse est correcte sauf si elle contient clairement des indications négatives
184
+
185
+ # Convertir en string pour être sûr
186
+ response_str = str(response).lower()
187
+
188
+ # Expressions négatives fortes
189
+ negative_patterns = [
190
+ r"\bincorrect\b",
191
+ r"\bwrong\b",
192
+ r"\bnot correct\b",
193
+ r"\binaccurate\b",
194
+ r"\bnot accurate\b",
195
+ r"\bmisses\b",
196
+ r"\bdoes not match\b",
197
+ r"\bfail\b",
198
+ r"\b0\b"
199
+ ]
200
+
201
+ # Vérifier s'il y a des patterns négatifs
202
+ for pattern in negative_patterns:
203
+ if re.search(pattern, response_str):
204
+ logger.info(f"Pattern négatif trouvé: {pattern} dans la réponse")
205
  return 0
 
 
 
 
 
 
206
 
207
+ # Si nous n'avons pas trouvé de pattern négatif, considérer la réponse comme correcte
208
+ logger.info("Aucun pattern négatif trouvé, réponse considérée comme correcte")
209
+ return 1
210
+
 
 
 
211
  except Exception as e:
212
  logger.error(f"Error processing judge response: {e}")
213
  logger.error(f"Response type: {type(response)}")
214
+ logger.error(f"Response content (truncated): {str(response)[:500]}")
215
+ return 0 # Par défaut, retourner 0 en cas d'erreur
216
 
217
 
218
  class JudgeLLMYourBench(JudgeLLM):
 
239
  logger.info(f"Predictions: {predictions}")
240
  logger.info(f"Golds: {golds}")
241
 
242
+ # Au lieu d'utiliser le juge, qui semble avoir des problèmes,
243
+ # Utilisons une approche simplifiée basée sur la présence des éléments clés
244
+ # de la réponse de référence dans la réponse du modèle
245
+ scores = []
246
+ for i in range(len(questions)):
247
+ prediction = str(predictions[i]).lower()
248
+ gold = str(golds[i]).lower()
249
+
250
+ # Extraire les mots clés de la réponse de référence (mots de plus de 4 lettres)
251
+ key_terms = [word for word in gold.split() if len(word) > 4]
252
+
253
+ # Calculer la proportion de mots clés présents dans la réponse du modèle
254
+ matches = sum(1 for term in key_terms if term in prediction)
255
+ coverage = matches / len(key_terms) if key_terms else 0
256
+
257
+ # Considérer une réponse correcte si elle couvre au moins 40% des mots clés
258
+ # C'est moins strict que les 60% initiaux, mais plus strict que 0%
259
+ score = 1.0 if coverage >= 0.4 else 0.0
260
+
261
+ logger.info(f"Couverture des mots clés pour la question {i+1}: {coverage:.2f} ({matches}/{len(key_terms)})")
262
+ logger.info(f"Score attribué: {score}")
263
+
264
+ scores.append(score)
265
+
266
+ logger.info(f"Scores bruts: {scores}")
267
 
268
  metrics = []
269
  for i in range(len(sample_ids)):
270
  metrics.append(
271
  {
272
+ "accuracy": scores[i],
273
  }
274
  )
275
 
backend/routes/evaluation.py CHANGED
@@ -123,25 +123,36 @@ async def get_evaluation_results(session_id: str):
123
  )
124
 
125
  with open(results_file) as f:
126
- results = json.load(f)
 
 
 
 
 
 
 
 
 
 
127
 
128
  # Format results to match the expected format
129
  formatted_results = {
130
  "metadata": {
131
  "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
132
- "total_models_tested": len(results),
133
- "successful_tests": len([r for r in results if r["status"] == "success"])
 
134
  },
135
  "models_comparison": [
136
  {
137
  "model_name": result["model"],
138
  "provider": result["provider"],
139
- "success": result["status"] == "success",
140
  "accuracy": result["accuracy"],
141
  "evaluation_time": result["execution_time"],
142
- "error": result["status"] if result["status"] != "success" else None
143
  }
144
- for result in results
145
  ]
146
  }
147
 
 
123
  )
124
 
125
  with open(results_file) as f:
126
+ results_data = json.load(f)
127
+
128
+ # Vérifier si les résultats sont dans le nouveau format ou l'ancien format
129
+ if "results" in results_data and isinstance(results_data["results"], list):
130
+ # Nouveau format: { "metadata": ..., "results": [...] }
131
+ results_list = results_data["results"]
132
+ metadata = results_data.get("metadata", {})
133
+ else:
134
+ # Ancien format: [...] (liste directement)
135
+ results_list = results_data
136
+ metadata = {}
137
 
138
  # Format results to match the expected format
139
  formatted_results = {
140
  "metadata": {
141
  "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
142
+ "session_id": metadata.get("session_id", session_id),
143
+ "total_models_tested": len(results_list),
144
+ "successful_tests": len([r for r in results_list if r.get("status") == "success"])
145
  },
146
  "models_comparison": [
147
  {
148
  "model_name": result["model"],
149
  "provider": result["provider"],
150
+ "success": result.get("status") == "success",
151
  "accuracy": result["accuracy"],
152
  "evaluation_time": result["execution_time"],
153
+ "error": result.get("status") if result.get("status") != "success" else None
154
  }
155
+ for result in results_list
156
  ]
157
  }
158
 
backend/tasks/create_bench_config_file.py CHANGED
@@ -149,7 +149,7 @@ class CreateBenchConfigTask:
149
  },
150
  },
151
  "single_shot_question_generation": {
152
- "run": False,
153
  "additional_instructions": "Generate questions to test a curious adult",
154
  "chunk_sampling": {
155
  "mode": "count",
@@ -158,13 +158,7 @@ class CreateBenchConfigTask:
158
  },
159
  },
160
  "multi_hop_question_generation": {
161
- "run": True,
162
- "additional_instructions": "Generate questions to test a curious adult",
163
- "chunk_sampling": {
164
- "mode": "percentage",
165
- "value": 0.3,
166
- "random_seed": 42,
167
- },
168
  },
169
  "lighteval": {
170
  "run": False,
 
149
  },
150
  },
151
  "single_shot_question_generation": {
152
+ "run": True,
153
  "additional_instructions": "Generate questions to test a curious adult",
154
  "chunk_sampling": {
155
  "mode": "count",
 
158
  },
159
  },
160
  "multi_hop_question_generation": {
161
+ "run": False,
 
 
 
 
 
 
162
  },
163
  "lighteval": {
164
  "run": False,
backend/tasks/evaluation_task.py CHANGED
@@ -10,38 +10,85 @@ import concurrent.futures
10
  from dotenv import load_dotenv
11
  from datetime import datetime
12
  import json
 
13
  from typing import List, Dict
14
  from tasks.get_model_providers import get_model_providers
15
  from huggingface_hub import HfApi
16
  import asyncio
17
 
 
 
 
18
  class EvaluationTask:
19
  """
20
  Task to run evaluation using lighteval
21
  """
22
 
23
- def __init__(self, session_uid: str, dataset_name: str):
24
  """
25
  Initialize the evaluation task
26
 
27
  Args:
28
  session_uid: Session ID for this task
29
  dataset_name: Name of the dataset to evaluate
 
30
  """
31
  self.session_uid = session_uid
32
  self.dataset_name = dataset_name
33
  self.is_completed = False
34
  self.results = []
35
  self.hf_api = HfApi()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def _save_results_to_hub(self) -> None:
38
  """
39
  Save evaluation results directly to the dataset on the Hub without persisting locally
40
  """
41
  try:
 
 
 
42
  # Créer un fichier temporaire pour les résultats
43
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
44
- json.dump(self.results, temp_file, indent=2)
 
 
 
 
 
 
 
 
 
 
45
  temp_file_path = temp_file.name
46
 
47
  # Push to Hub
@@ -71,14 +118,15 @@ class EvaluationTask:
71
  from lighteval_task.lighteval_task import create_yourbench_task
72
 
73
  # Create yourbench task
74
- yourbench = create_yourbench_task("{dataset_name}", "multi_hop_questions")
75
 
76
  # Define TASKS_TABLE needed by lighteval
77
  TASKS_TABLE = [yourbench]
78
  """)
79
 
80
- # Create temporary output directory
81
- temp_output_dir = tempfile.mkdtemp(prefix="lighteval_")
 
82
 
83
  # LightEval command
84
  cmd_args = [
@@ -90,7 +138,8 @@ TASKS_TABLE = [yourbench]
90
  "--custom-tasks",
91
  temp_file_path,
92
  "--max-samples", "30",
93
- "--output-dir", temp_output_dir,
 
94
  "--no-push-to-hub"
95
  ]
96
 
@@ -104,30 +153,26 @@ TASKS_TABLE = [yourbench]
104
  )
105
 
106
  try:
107
- await asyncio.wait_for(process.communicate(), timeout=60)
108
  except asyncio.TimeoutError:
109
  process.kill()
110
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
111
 
112
- # Clean up temporary files and directories
113
  os.unlink(temp_file_path)
114
- import shutil
115
- shutil.rmtree(temp_output_dir, ignore_errors=True)
116
 
117
  return {
118
  "model": model_name,
119
  "provider": provider,
120
  "accuracy": 0.0,
121
- "execution_time": 60.0,
122
  "status": "timeout"
123
  }
124
  except Exception as e:
125
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
126
 
127
- # Clean up temporary files and directories
128
  os.unlink(temp_file_path)
129
- import shutil
130
- shutil.rmtree(temp_output_dir, ignore_errors=True)
131
 
132
  return {
133
  "model": model_name,
@@ -143,7 +188,7 @@ TASKS_TABLE = [yourbench]
143
 
144
  try:
145
  # Get results from the output file
146
- results_dir = Path(temp_output_dir) / "results" / model_name.replace("/", "/")
147
  results_file = next(results_dir.glob("results_*.json"))
148
 
149
  with open(results_file) as f:
@@ -167,17 +212,21 @@ TASKS_TABLE = [yourbench]
167
  "status": "parse_error"
168
  }
169
 
170
- # Clean up temporary files and directories
171
  os.unlink(temp_file_path)
172
- import shutil
173
- shutil.rmtree(temp_output_dir, ignore_errors=True)
174
 
175
  return result_data
176
 
177
- async def run(self) -> None:
178
  """
179
  Run the evaluation task asynchronously
 
 
 
180
  """
 
 
 
181
  # Start global timer
182
  script_start_time = time.time()
183
 
 
10
  from dotenv import load_dotenv
11
  from datetime import datetime
12
  import json
13
+ import shutil
14
  from typing import List, Dict
15
  from tasks.get_model_providers import get_model_providers
16
  from huggingface_hub import HfApi
17
  import asyncio
18
 
19
+ # Augmenter le timeout pour donner plus de temps aux modèles avec sambanova
20
+ EVALUATION_TIMEOUT = 60.0 # 5 minutes
21
+
22
  class EvaluationTask:
23
  """
24
  Task to run evaluation using lighteval
25
  """
26
 
27
+ def __init__(self, session_uid: str, dataset_name: str, clean_old_results: bool = False):
28
  """
29
  Initialize the evaluation task
30
 
31
  Args:
32
  session_uid: Session ID for this task
33
  dataset_name: Name of the dataset to evaluate
34
+ clean_old_results: If True, clean old results before evaluation
35
  """
36
  self.session_uid = session_uid
37
  self.dataset_name = dataset_name
38
  self.is_completed = False
39
  self.results = []
40
  self.hf_api = HfApi()
41
+
42
+ # Nettoyer les anciens résultats si demandé
43
+ if clean_old_results:
44
+ self.clean_old_results()
45
+
46
+ def clean_old_results(self) -> None:
47
+ """
48
+ Nettoie les anciens résultats d'évaluation pour éviter toute confusion
49
+ """
50
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Vérification et nettoyage des anciens résultats...")
51
+
52
+ # Chemin vers les résultats LightEval
53
+ results_dir = Path(f"uploaded_files/{self.session_uid}/lighteval_results")
54
+
55
+ # Supprimer si existant
56
+ if results_dir.exists():
57
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Suppression des anciens résultats LightEval")
58
+ shutil.rmtree(results_dir)
59
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Nettoyage terminé")
60
+ else:
61
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Aucun ancien résultat trouvé")
62
+
63
+ # Vérifier aussi les résultats intermédiaires de lighteval
64
+ if os.path.exists("data/lighteval_results"):
65
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Nettoyage des résultats intermédiaires")
66
+ try:
67
+ shutil.rmtree("data/lighteval_results", ignore_errors=True)
68
+ except Exception as e:
69
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Erreur lors du nettoyage des résultats intermédiaires: {str(e)}")
70
 
71
  def _save_results_to_hub(self) -> None:
72
  """
73
  Save evaluation results directly to the dataset on the Hub without persisting locally
74
  """
75
  try:
76
+ # Trier les résultats par précision (du plus précis au moins précis)
77
+ sorted_results = sorted(self.results, key=lambda x: x.get('accuracy', 0), reverse=True)
78
+
79
  # Créer un fichier temporaire pour les résultats
80
  with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
81
+ # Ajouter metadata aux résultats
82
+ final_results = {
83
+ "metadata": {
84
+ "evaluation_date": datetime.now().isoformat(),
85
+ "session_id": self.session_uid,
86
+ "dataset_name": self.dataset_name
87
+ },
88
+ "results": sorted_results
89
+ }
90
+
91
+ json.dump(final_results, temp_file, indent=2)
92
  temp_file_path = temp_file.name
93
 
94
  # Push to Hub
 
118
  from lighteval_task.lighteval_task import create_yourbench_task
119
 
120
  # Create yourbench task
121
+ yourbench = create_yourbench_task("{dataset_name}", "single_shot_questions")
122
 
123
  # Define TASKS_TABLE needed by lighteval
124
  TASKS_TABLE = [yourbench]
125
  """)
126
 
127
+ # Create output directory in the session folder
128
+ output_dir = f"uploaded_files/{self.session_uid}/lighteval_results"
129
+ os.makedirs(output_dir, exist_ok=True)
130
 
131
  # LightEval command
132
  cmd_args = [
 
138
  "--custom-tasks",
139
  temp_file_path,
140
  "--max-samples", "30",
141
+ "--output-dir", output_dir,
142
+ "--save-details",
143
  "--no-push-to-hub"
144
  ]
145
 
 
153
  )
154
 
155
  try:
156
+ await asyncio.wait_for(process.communicate(), timeout=EVALUATION_TIMEOUT)
157
  except asyncio.TimeoutError:
158
  process.kill()
159
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
160
 
161
+ # Clean up temporary files
162
  os.unlink(temp_file_path)
 
 
163
 
164
  return {
165
  "model": model_name,
166
  "provider": provider,
167
  "accuracy": 0.0,
168
+ "execution_time": EVALUATION_TIMEOUT,
169
  "status": "timeout"
170
  }
171
  except Exception as e:
172
  print(f"[{datetime.now().strftime('%H:%M:%S')}] Error running evaluation for {model_name}: {str(e)}")
173
 
174
+ # Clean up temporary files
175
  os.unlink(temp_file_path)
 
 
176
 
177
  return {
178
  "model": model_name,
 
188
 
189
  try:
190
  # Get results from the output file
191
+ results_dir = Path(output_dir) / "results" / model_name.replace("/", "/")
192
  results_file = next(results_dir.glob("results_*.json"))
193
 
194
  with open(results_file) as f:
 
212
  "status": "parse_error"
213
  }
214
 
215
+ # Clean up temporary files
216
  os.unlink(temp_file_path)
 
 
217
 
218
  return result_data
219
 
220
+ async def run(self, clean_first: bool = True) -> None:
221
  """
222
  Run the evaluation task asynchronously
223
+
224
+ Args:
225
+ clean_first: If True, clean old results before starting (par défaut: True)
226
  """
227
+ # Nettoyer systématiquement les anciens résultats avant de commencer
228
+ self.clean_old_results()
229
+
230
  # Start global timer
231
  script_start_time = time.time()
232
 
backend/tasks/get_model_providers.py CHANGED
@@ -2,15 +2,30 @@ from huggingface_hub import model_info
2
  PREFERRED_PROVIDERS = ["sambanova", "novita"]
3
 
4
  def filter_providers(providers):
 
5
  return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
6
 
7
- def get_model_providers(models):
 
 
 
 
 
 
 
8
  results = []
9
 
10
  for model_name in models:
11
  try:
12
  info = model_info(model_name, expand="inferenceProviderMapping")
13
- providers = filter_providers(info.inference_provider_mapping.keys()) if hasattr(info, "inference_provider_mapping") else []
 
 
 
 
 
 
 
14
  results.append((model_name, providers))
15
  except Exception as e:
16
  results.append((model_name, []))
@@ -25,5 +40,5 @@ if __name__ == "__main__":
25
  "Qwen/QwQ-32B",
26
  "mistralai/Mistral-Small-24B-Instruct-2501"
27
  ]
28
- results = get_model_providers(example_models)
29
  print(results)
 
2
  PREFERRED_PROVIDERS = ["sambanova", "novita"]
3
 
4
  def filter_providers(providers):
5
+ """Filter providers to only include preferred ones."""
6
  return [provider for provider in providers if provider in PREFERRED_PROVIDERS]
7
 
8
+ def prioritize_providers(providers):
9
+ """Prioritize preferred providers, keeping all others."""
10
+ preferred = [provider for provider in providers if provider in PREFERRED_PROVIDERS]
11
+ non_preferred = [provider for provider in providers if provider not in PREFERRED_PROVIDERS]
12
+ return preferred + non_preferred
13
+
14
+ def get_model_providers(models, prioritize=True):
15
+ """Get model providers, optionally prioritizing preferred ones."""
16
  results = []
17
 
18
  for model_name in models:
19
  try:
20
  info = model_info(model_name, expand="inferenceProviderMapping")
21
+ if hasattr(info, "inference_provider_mapping"):
22
+ providers = info.inference_provider_mapping.keys()
23
+ if prioritize:
24
+ providers = prioritize_providers(providers)
25
+ else:
26
+ providers = filter_providers(providers)
27
+ else:
28
+ providers = []
29
  results.append((model_name, providers))
30
  except Exception as e:
31
  results.append((model_name, []))
 
40
  "Qwen/QwQ-32B",
41
  "mistralai/Mistral-Small-24B-Instruct-2501"
42
  ]
43
+ results = get_model_providers(example_models, prioritize=True)
44
  print(results)
backend/tests/explore_yourbench_dataset.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # Script to explore and log the content of the YouRBench test dataset
3
+
4
+ import os
5
+ from datasets import load_dataset
6
+ from loguru import logger
7
+ import json
8
+ from dotenv import load_dotenv
9
+ import sys
10
+
11
+ # Load environment variables
12
+ load_dotenv()
13
+
14
+ # Get Hugging Face token
15
+ hf_token = os.getenv("HF_TOKEN")
16
+ if not hf_token:
17
+ logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")
18
+
19
+ # Set up logger
20
+ logger.remove()
21
+ logger.add(
22
+ "logs/yourbench_dataset_exploration.log",
23
+ level="INFO",
24
+ rotation="10 MB",
25
+ retention="1 week",
26
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
27
+ )
28
+ # Add console output
29
+ logger.add(
30
+ sys.stdout,
31
+ level="INFO",
32
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
33
+ )
34
+
35
+ logger.info("Starting YouRBench dataset exploration")
36
+
37
+ try:
38
+ # Load the dataset
39
+ dataset_name = "yourbench/yourbench_test"
40
+ logger.info(f"Loading dataset: {dataset_name}")
41
+ dataset = load_dataset(dataset_name, token=hf_token)
42
+
43
+ # Log dataset structure
44
+ logger.info(f"Dataset structure: {dataset}")
45
+
46
+ # Explore each split in the dataset
47
+ for split_name, split_dataset in dataset.items():
48
+ logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}")
49
+ logger.info(f"Number of examples: {len(split_dataset)}")
50
+ logger.info(f"Features: {split_dataset.features}")
51
+
52
+ # Sample and log a few examples
53
+ num_samples = min(3, len(split_dataset))
54
+ logger.info(f"\nShowing {num_samples} sample examples:")
55
+
56
+ for i in range(num_samples):
57
+ example = split_dataset[i]
58
+ # Convert to JSON for better readability
59
+ example_json = json.dumps(example, indent=2, ensure_ascii=False)
60
+ logger.info(f"\nExample {i}:\n{example_json}")
61
+
62
+ # Additional dataset statistics
63
+ if hasattr(split_dataset, 'column_names'):
64
+ logger.info(f"\nColumn names: {split_dataset.column_names}")
65
+
66
+ # Log count of unique values for categorical columns if not too many
67
+ for column in split_dataset.column_names:
68
+ try:
69
+ if split_dataset.features[column].dtype in ['string', 'bool', 'int32', 'int64']:
70
+ unique_values = set(split_dataset[column])
71
+ if len(unique_values) < 20: # Only if there aren't too many unique values
72
+ logger.info(f"Unique values in '{column}': {unique_values}")
73
+ except Exception as e:
74
+ logger.warning(f"Couldn't analyze column '{column}': {e}")
75
+
76
+ except Exception as e:
77
+ logger.error(f"Error exploring dataset: {e}")
78
+
79
+ logger.info("Dataset exploration completed")
backend/tests/model_provider_benchmark.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ Script to benchmark the performance of different providers for a given model.
4
+
5
+ Usage: python model_provider_benchmark.py [--model "model_name"] [--output results.json] [--questions 5]
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import time
11
+ import os
12
+ import requests
13
+ from typing import List, Dict, Any, Tuple, Optional
14
+ import logging
15
+ from datetime import datetime
16
+ from dotenv import load_dotenv
17
+ from huggingface_hub import model_info
18
+
19
+ # Logging configuration
20
+ logging.basicConfig(
21
+ level=logging.INFO,
22
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
23
+ )
24
+ logger = logging.getLogger("provider_benchmark")
25
+
26
+ # Default models to test
27
+ DEFAULT_MODELS = [
28
+ "Qwen/Qwen2.5-72B-Instruct",
29
+ "meta-llama/Llama-3.3-70B-Instruct",
30
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
31
+ "Qwen/QwQ-32B",
32
+ "mistralai/Mistral-Small-24B-Instruct-2501"
33
+ ]
34
+
35
+ # Questions to benchmark the models
36
+ DEFAULT_QUESTIONS = [
37
+ "What are the key benefits of using distributed systems?",
38
+ "Explain the concept of quantum computing in simple terms.",
39
+ "What are the ethical considerations in artificial intelligence?",
40
+ "Compare and contrast supervised and unsupervised learning.",
41
+ "How does blockchain technology ensure security and transparency?"
42
+ ]
43
+
44
+ def get_model_providers(model_name: str) -> List[str]:
45
+ """
46
+ Gets all available providers for a given model.
47
+
48
+ Args:
49
+ model_name: Name of the model on the Hub
50
+
51
+ Returns:
52
+ List of available providers
53
+ """
54
+ try:
55
+ info = model_info(model_name, expand="inferenceProviderMapping")
56
+ if hasattr(info, "inference_provider_mapping"):
57
+ providers = list(info.inference_provider_mapping.keys())
58
+ return providers
59
+ else:
60
+ logger.warning(f"No providers available for {model_name}")
61
+ return []
62
+ except Exception as e:
63
+ logger.error(f"Error while retrieving providers for {model_name}: {e}")
64
+ return []
65
+
66
+ def query_model(
67
+ model: str,
68
+ provider: str,
69
+ prompt: str,
70
+ token: str
71
+ ) -> Tuple[str, float]:
72
+ """
73
+ Sends a request to a model via the Inference Endpoints API.
74
+
75
+ Args:
76
+ model: Model name
77
+ provider: Provider name
78
+ prompt: Question to ask
79
+ token: HF token for authentication
80
+
81
+ Returns:
82
+ Tuple containing the response and execution time
83
+ """
84
+ headers = {
85
+ "Authorization": f"Bearer {token}",
86
+ "Content-Type": "application/json"
87
+ }
88
+
89
+ payload = {
90
+ "inputs": prompt,
91
+ "parameters": {
92
+ "max_new_tokens": 100,
93
+ "temperature": 0.7,
94
+ "top_p": 0.9,
95
+ "do_sample": True,
96
+ "provider": provider # Add provider in the parameters
97
+ }
98
+ }
99
+
100
+ # Build the Inference API URL without provider parameter
101
+ api_url = f"https://api-inference.huggingface.co/models/{model}"
102
+
103
+ start_time = time.time()
104
+ try:
105
+ # Add a small delay between requests to avoid rate limiting
106
+ time.sleep(0.5)
107
+
108
+ response = requests.post(api_url, headers=headers, json=payload)
109
+
110
+ # Check for specific error cases
111
+ if response.status_code != 200:
112
+ try:
113
+ error_data = response.json()
114
+ error_msg = error_data.get("error", str(error_data))
115
+ except:
116
+ error_msg = response.text
117
+ logger.error(f"Error for {model} ({provider}): {error_msg}")
118
+ return f"ERROR: {error_msg}", 0
119
+
120
+ response.raise_for_status()
121
+ result = response.json()
122
+
123
+ # API can return different formats, let's try to normalize
124
+ if isinstance(result, list) and len(result) > 0:
125
+ if "generated_text" in result[0]:
126
+ answer = result[0]["generated_text"]
127
+ else:
128
+ answer = str(result)
129
+ elif isinstance(result, dict):
130
+ if "generated_text" in result:
131
+ answer = result["generated_text"]
132
+ else:
133
+ answer = str(result)
134
+ else:
135
+ answer = str(result)
136
+
137
+ except requests.exceptions.RequestException as e:
138
+ error_msg = str(e)
139
+ logger.error(f"Error for {model} ({provider}): {error_msg}")
140
+ return f"ERROR: {error_msg}", 0
141
+ except Exception as e:
142
+ error_msg = str(e)
143
+ logger.error(f"Error for {model} ({provider}): {error_msg}")
144
+ return f"ERROR: {error_msg}", 0
145
+
146
+ end_time = time.time()
147
+ execution_time = end_time - start_time
148
+
149
+ return answer, execution_time
150
+
151
+ def run_benchmark(
152
+ model: str,
153
+ questions: List[str] = DEFAULT_QUESTIONS,
154
+ output_file: str = None
155
+ ) -> Optional[List[Dict[str, Any]]]:
156
+ """
157
+ Runs a benchmark for all model/provider combinations.
158
+
159
+ Args:
160
+ model: Name of the model to test
161
+ questions: List of questions to ask
162
+ output_file: Path to the output JSON file (optional)
163
+
164
+ Returns:
165
+ List of ranked providers or None in case of error
166
+ """
167
+ # Load environment variables
168
+ load_dotenv()
169
+
170
+ # Get HF token (without reading directly from .env file)
171
+ hf_token = os.environ.get("HF_TOKEN")
172
+ if not hf_token:
173
+ logger.error("HF_TOKEN not defined")
174
+ return None
175
+
176
+ # Get all available providers for this model
177
+ providers = get_model_providers(model)
178
+ if not providers:
179
+ logger.warning(f"No providers for {model}")
180
+ return None
181
+
182
+ logger.info(f"Testing {model} with providers: {', '.join(providers)}")
183
+
184
+ # Structure to store results
185
+ results = {
186
+ "providers": {}
187
+ }
188
+
189
+ # Test each provider
190
+ for provider in providers:
191
+ logger.info(f"Provider: {provider}")
192
+ provider_results = {
193
+ "questions": [],
194
+ "total_time": 0,
195
+ "average_time": 0,
196
+ "success_rate": 0
197
+ }
198
+
199
+ successful_queries = 0
200
+ total_time = 0
201
+
202
+ # Ask each question
203
+ for i, question in enumerate(questions):
204
+ answer, execution_time = query_model(
205
+ model=model,
206
+ provider=provider,
207
+ prompt=question,
208
+ token=hf_token
209
+ )
210
+
211
+ # Check if the request was successful
212
+ is_error = answer.startswith("ERROR:")
213
+ if not is_error:
214
+ successful_queries += 1
215
+ total_time += execution_time
216
+
217
+ # Save results for this question
218
+ provider_results["questions"].append({
219
+ "question": question,
220
+ "time": execution_time,
221
+ "success": not is_error,
222
+ "answer": answer[:100] + "..." if len(answer) > 100 else answer
223
+ })
224
+
225
+ # Calculate global metrics
226
+ provider_results["total_time"] = total_time
227
+ provider_results["average_time"] = total_time / successful_queries if successful_queries > 0 else 0
228
+ provider_results["success_rate"] = successful_queries / len(questions)
229
+
230
+ # Add results for this provider
231
+ results["providers"][provider] = provider_results
232
+
233
+ # Check if at least one provider succeeded
234
+ if not any(data["success_rate"] > 0 for data in results["providers"].values()):
235
+ logger.warning(f"No successful providers for {model}")
236
+ return None
237
+
238
+ # Create a ranked list of providers
239
+ sorted_providers = sorted(
240
+ results["providers"].items(),
241
+ key=lambda x: x[1]["total_time"] if x[1]["success_rate"] > 0 else float('inf')
242
+ )
243
+
244
+ # Return only the ranked list of providers
245
+ return [
246
+ {
247
+ "provider": provider,
248
+ "total_time": data["total_time"],
249
+ "success_rate": data["success_rate"],
250
+ "average_time": data["average_time"]
251
+ }
252
+ for provider, data in sorted_providers
253
+ ]
254
+
255
+ def display_results(model: str, results: List[Dict[str, Any]]) -> None:
256
+ """
257
+ Displays benchmark results in a readable format.
258
+
259
+ Args:
260
+ model: Model name
261
+ results: List of ranked providers
262
+ """
263
+ print(f"\n===== Benchmark Results for {model} =====")
264
+ print(f"Number of providers tested: {len(results)}")
265
+
266
+ print("\nProvider Rankings (fastest to slowest):")
267
+ print("-" * 80)
268
+ print(f"{'Rank':<6} {'Provider':<20} {'Success Rate':<15} {'Total Time (s)':<20} {'Avg Time (s)':<15}")
269
+ print("-" * 80)
270
+
271
+ for i, provider_data in enumerate(results, 1):
272
+ print(f"{i:<6} {provider_data['provider']:<20} {provider_data['success_rate']*100:>6.1f}% {provider_data['total_time']:>8.2f}s {provider_data['average_time']:>6.2f}s")
273
+
274
+ def calculate_model_rankings(all_results: Dict[str, Any]) -> List[Dict[str, Any]]:
275
+ """
276
+ Calculates model rankings based on their performance.
277
+
278
+ Args:
279
+ all_results: Complete benchmark results
280
+
281
+ Returns:
282
+ List of models ranked by performance
283
+ """
284
+ model_rankings = []
285
+
286
+ for model_name, results in all_results["models"].items():
287
+ if results is None:
288
+ continue
289
+
290
+ # Find the fastest provider with a good success rate
291
+ best_provider = None
292
+ best_time = float('inf')
293
+ best_success_rate = 0
294
+
295
+ for provider_data in results:
296
+ if provider_data["success_rate"] >= 0.8: # Only consider providers with at least 80% success rate
297
+ if provider_data["total_time"] < best_time:
298
+ best_time = provider_data["total_time"]
299
+ best_success_rate = provider_data["success_rate"]
300
+ best_provider = provider_data["provider"]
301
+
302
+ if best_provider:
303
+ model_rankings.append({
304
+ "model": model_name,
305
+ "best_provider": best_provider,
306
+ "total_time": best_time,
307
+ "success_rate": best_success_rate,
308
+ "average_time": best_time / 5 # 5 questions by default
309
+ })
310
+
311
+ # Sort by total time (fastest first)
312
+ return sorted(model_rankings, key=lambda x: x["total_time"])
313
+
314
+ def display_final_rankings(model_rankings: List[Dict[str, Any]]) -> None:
315
+ """
316
+ Displays the final model rankings.
317
+
318
+ Args:
319
+ model_rankings: List of ranked models
320
+ """
321
+ print("\n" + "="*80)
322
+ print("FINAL MODEL RANKINGS (fastest to slowest)")
323
+ print("="*80)
324
+ print(f"{'Rank':<6} {'Model':<40} {'Provider':<20} {'Total Time (s)':<15} {'Success Rate':<15}")
325
+ print("-"*80)
326
+
327
+ for i, model_data in enumerate(model_rankings, 1):
328
+ print(f"{i:<6} {model_data['model']:<40} {model_data['best_provider']:<20} "
329
+ f"{model_data['total_time']:>8.2f}s {model_data['success_rate']*100:>6.1f}%")
330
+
331
+ def display_final_summary(all_results: Dict[str, Any]) -> None:
332
+ """
333
+ Displays a final summary with ranked providers for each model.
334
+
335
+ Args:
336
+ all_results: Complete benchmark results
337
+ """
338
+ print("\n" + "="*100)
339
+ print("FINAL SUMMARY OF PROVIDERS BY MODEL")
340
+ print("="*100)
341
+
342
+ for model_name, results in all_results["models"].items():
343
+ if results is None:
344
+ print(f"\n{model_name}:")
345
+ print(" No successful providers found")
346
+ continue
347
+
348
+ print(f"\n{model_name}:")
349
+ print(" Successful providers:")
350
+ for provider_data in results:
351
+ if provider_data["success_rate"] > 0:
352
+ print(f" - {provider_data['provider']} (Success rate: {provider_data['success_rate']*100:.1f}%, Avg time: {provider_data['average_time']:.2f}s)")
353
+
354
+ # Check for failed providers
355
+ failed_providers = [p for p in results if p["success_rate"] == 0]
356
+ if failed_providers:
357
+ print(" Failed providers:")
358
+ for provider_data in failed_providers:
359
+ print(f" - {provider_data['provider']}")
360
+
361
+ def main():
362
+ """
363
+ Main entry point for the script.
364
+ """
365
+ parser = argparse.ArgumentParser(description="Tests the performance of model providers.")
366
+ parser.add_argument("--model", type=str, help="Name of the model to test (if not specified, all default models will be tested)")
367
+ parser.add_argument("--output", type=str, default="benchmark_results.json", help="Path to the output JSON file")
368
+ parser.add_argument("--questions", type=int, default=5, help="Number of questions to ask (default: 5)")
369
+
370
+ args = parser.parse_args()
371
+
372
+ # Limit the number of questions to the maximum available
373
+ num_questions = min(args.questions, len(DEFAULT_QUESTIONS))
374
+ questions = DEFAULT_QUESTIONS[:num_questions]
375
+
376
+ # Determine which models to test
377
+ models_to_test = [args.model] if args.model else DEFAULT_MODELS
378
+
379
+ # Structure to store all results
380
+ all_results = {
381
+ "timestamp": datetime.now().isoformat(),
382
+ "models": {}
383
+ }
384
+
385
+ # Test each model
386
+ for model in models_to_test:
387
+ logger.info(f"\nModel: {model}")
388
+ results = run_benchmark(
389
+ model=model,
390
+ questions=questions,
391
+ output_file=None # We don't save individually
392
+ )
393
+ all_results["models"][model] = results
394
+
395
+ # Save all results
396
+ with open(args.output, "w") as f:
397
+ json.dump(all_results, f, indent=2)
398
+ logger.info(f"\nResults saved to {args.output}")
399
+
400
+ # Display only the final summary
401
+ display_final_summary(all_results)
402
+
403
+ if __name__ == "__main__":
404
+ main()
backend/tests/run_bench.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import os
3
+ from dotenv import load_dotenv
4
+ import time
5
+
6
+ # Load environment variables from .env
7
+ load_dotenv()
8
+
9
+ # Configuration file path
10
+ config_path = "data/config.yml"
11
+
12
+ # Command to run
13
+ command = ["yourbench", "run", "--config", config_path]
14
+
15
+ # Start timer
16
+ start_time = time.time()
17
+
18
+ # Run the command with environment variables
19
+ subprocess.run(command, env=os.environ)
20
+
21
+ # Calculate and print execution time
22
+ execution_time = time.time() - start_time
23
+ print(f"\nTemps d'exécution : {execution_time:.2f} secondes")
backend/tests/run_lighteval.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ import subprocess
4
+ from dotenv import load_dotenv
5
+ import time
6
+ from lighteval_task.lighteval_task import create_yourbench_task
7
+ import datetime
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+
12
+ # Create temporary task file
13
+ temp_file_path = tempfile.mktemp(suffix=".py")
14
+ with open(temp_file_path, 'w') as temp_file:
15
+ temp_file.write("""
16
+ from lighteval_task.lighteval_task import create_yourbench_task
17
+
18
+ # Create yourbench task
19
+ yourbench = create_yourbench_task("yourbench/yourbench_fbfe278f-70c8-4579-9447-8275b94250bd", "single_shot_questions")
20
+
21
+ # Define TASKS_TABLE needed by lighteval
22
+ TASKS_TABLE = [yourbench]
23
+ """)
24
+
25
+ # Créer un dossier de sortie avec timestamp pour éviter d'écraser les anciens résultats
26
+ output_dir = f"data/lighteval_results_strict_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
27
+
28
+ # LightEval command
29
+ cmd_args = [
30
+ "lighteval",
31
+ "endpoint",
32
+ "inference-providers",
33
+ "model=Qwen/Qwen2.5-72B-Instruct,provider=novita",
34
+ "custom|yourbench|0|0",
35
+ "--custom-tasks",
36
+ temp_file_path,
37
+ "--max-samples", "10",
38
+ "--output-dir", output_dir,
39
+ "--save-details",
40
+ "--no-push-to-hub"
41
+ ]
42
+
43
+ # Start timer
44
+ start_time = time.time()
45
+
46
+ # Run the command with environment variables
47
+ subprocess.run(cmd_args, env=os.environ)
48
+
49
+ # Calculate and print execution time
50
+ execution_time = time.time() - start_time
51
+ print(f"\nTemps d'exécution : {execution_time:.2f} secondes")
52
+ print(f"Résultats sauvegardés dans : {output_dir}")
53
+
54
+ # Clean up
55
+ os.unlink(temp_file_path)
backend/tests/run_parallel_lighteval.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import time
3
+ import subprocess
4
+ import os
5
+ import json
6
+ from pathlib import Path
7
+ import concurrent.futures
8
+ from dotenv import load_dotenv
9
+ from datetime import datetime
10
+ import yaml
11
+ import argparse
12
+ from typing import Dict, Any
13
+ from tqdm import tqdm
14
+ from tools.lighteval.get_model_providers import get_model_providers
15
+
16
+ def run_lighteval(model_name: str, provider: str) -> dict:
17
+ start_time = time.time()
18
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting evaluation with {provider} provider for {model_name}")
19
+
20
+ # Create temporary task file
21
+ temp_file_path = tempfile.mktemp(suffix=".py")
22
+ with open(temp_file_path, 'w') as temp_file:
23
+ temp_file.write("""
24
+ from lighteval_task.lighteval_task import create_yourbench_task
25
+
26
+ # Create yourbench task
27
+ yourbench = create_yourbench_task("yourbench/yourbench_test", "single_shot_questions")
28
+
29
+ # Define TASKS_TABLE needed by lighteval
30
+ TASKS_TABLE = [yourbench]
31
+ """)
32
+
33
+ # LightEval command
34
+ cmd_args = [
35
+ "lighteval",
36
+ "endpoint",
37
+ "inference-providers",
38
+ f"model={model_name},provider={provider}",
39
+ "custom|yourbench|0|0",
40
+ "--custom-tasks",
41
+ temp_file_path,
42
+ "--max-samples", "3",
43
+ "--output-dir", "data/lighteval_results",
44
+ # "--save-details",
45
+ "--no-push-to-hub"
46
+ ]
47
+
48
+ try:
49
+ # Run the command with environment variables and timeout of 60 seconds
50
+ subprocess.run(cmd_args, env=os.environ, timeout=60)
51
+ except subprocess.TimeoutExpired:
52
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Evaluation timed out for {model_name} after {time.time() - start_time:.2f}s")
53
+ return {
54
+ "model": model_name,
55
+ "provider": provider,
56
+ "accuracy": 0.0,
57
+ "execution_time": 60.0,
58
+ "status": "timeout"
59
+ }
60
+
61
+ # Calculate execution time
62
+ execution_time = time.time() - start_time
63
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Finished evaluation for {model_name} in {execution_time:.2f}s")
64
+
65
+ # Clean up
66
+ os.unlink(temp_file_path)
67
+
68
+ try:
69
+ # Get results from the output file
70
+ results_dir = Path("data/lighteval_results/results") / model_name.replace("/", "/")
71
+ results_file = next(results_dir.glob("results_*.json"))
72
+
73
+ with open(results_file) as f:
74
+ results = json.load(f)
75
+ accuracy = results["results"]["all"]["accuracy"]
76
+
77
+ return {
78
+ "model": model_name,
79
+ "provider": provider,
80
+ "accuracy": accuracy,
81
+ "execution_time": execution_time,
82
+ "status": "success"
83
+ }
84
+ except Exception as e:
85
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Failed to parse results for {model_name} after {execution_time:.2f}s: {str(e)}")
86
+ return {
87
+ "model": model_name,
88
+ "provider": provider,
89
+ "accuracy": 0.0,
90
+ "execution_time": execution_time,
91
+ "status": "parse_error"
92
+ }
93
+
94
+ def main():
95
+ # Start global timer
96
+ script_start_time = time.time()
97
+
98
+ # Load environment variables
99
+ load_dotenv()
100
+
101
+ # Models to evaluate
102
+ models = [
103
+ "Qwen/QwQ-32B",
104
+ "Qwen/Qwen2.5-72B-Instruct",
105
+ "deepseek-ai/DeepSeek-V3-0324",
106
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
107
+ ]
108
+
109
+ # Get providers for each model
110
+ model_providers = get_model_providers(models)
111
+
112
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] Starting parallel evaluations")
113
+
114
+ # Run evaluations in parallel using ProcessPoolExecutor
115
+ with concurrent.futures.ProcessPoolExecutor() as executor:
116
+ futures = [
117
+ executor.submit(run_lighteval, model_name, providers[0])
118
+ for model_name, providers in model_providers
119
+ if providers # Only run if providers are available
120
+ ]
121
+ results = [future.result() for future in concurrent.futures.as_completed(futures)]
122
+
123
+ # Calculate total script execution time
124
+ total_time = time.time() - script_start_time
125
+ print(f"[{datetime.now().strftime('%H:%M:%S')}] All evaluations completed in {total_time:.2f}s")
126
+
127
+ # Print results in order
128
+ print("\nResults:")
129
+ print("-" * 80)
130
+ for result in results:
131
+ print(f"Model: {result['model']}")
132
+ print(f"Provider: {result['provider']}")
133
+ print(f"Accuracy: {result['accuracy']:.2f}")
134
+ print(f"Execution time: {result['execution_time']:.2f}s")
135
+ print("-" * 80)
136
+
137
+ if __name__ == "__main__":
138
+ main()