Spaces: Running on CPU Upgrade
add prerendered documents | update filename | refactor
Files changed:
- .gitignore +2 -0
- backend/main.py +2 -2
- backend/old-pyproject.toml +0 -26
- backend/poetry.lock +0 -0
- backend/requirements.txt +0 -1
- backend/routes/benchmark.py +2 -2
- backend/routes/evaluation.py +1 -1
- backend/routes/questions.py +0 -4
- backend/routes/upload.py +23 -0
- backend/tasks/{createBench.py → create_bench.py} +0 -0
- backend/tasks/{createBenchConfigFile.py → create_bench_config_file.py} +1 -1
- backend/tasks/{evaluationTask.py → evaluation_task.py} +0 -0
- backend/test_import.py +0 -5
- backend/tests/test_evaluation.py +0 -165
- backend/tests/test_hf_upload.py +0 -78
- backend/tests/test_inference.py +0 -84
- backend/tests/test_lighteval.py +0 -151
- backend/tests/test_openai.py +0 -31
- backend/tests/test_parallel_lighteval.py +0 -278
- backend/tests/test_provider_parallel_support.py +0 -227
- backend/tests/test_yourbench_results.py +0 -394
- backend/yourbench_simple_demo.egg-info/SOURCES.txt +0 -8
- frontend/src/components/BenchmarkCreateForm.jsx +199 -151
- frontend/src/components/BenchmarkEvaluation.jsx +56 -16
- frontend/src/components/BenchmarkGenerator.jsx +96 -4
- frontend/src/components/EvaluationDisplay.jsx +183 -25
- frontend/src/components/Intro.jsx +12 -5
- frontend/src/pages/BenchmarkDisplayPage.jsx +10 -1
- frontend/src/pages/BenchmarkEvaluationPage.jsx +13 -1
- frontend/src/pages/BenchmarkGenerationPage.jsx +2 -0
- frontend/src/pages/HomePage.jsx +6 -2
.gitignore
CHANGED
@@ -3,6 +3,8 @@
 __pycache__
 .cache/
 
+*.egg-info
+
 # dependencies
 
 frontend/node_modules
backend/main.py
CHANGED
@@ -1,8 +1,8 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, UploadFile, File, Form, Request, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 import os
 from dotenv import load_dotenv
-from routes import routers, session_files, active_bench_tasks
+from routes import routers, session_files, active_bench_tasks, benchmark
 
 # Load environment variables from .env file
 load_dotenv()
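For reference, `routers` is the collection this file already imports from the routes package, and the diff only extends that import (adding `benchmark` plus the upload-related FastAPI symbols). A minimal, illustrative sketch of how such a collection of routers is typically mounted on a FastAPI app; the registration code itself is not shown in this diff, so treat the details below as an assumption rather than the Space's actual code:

from fastapi import FastAPI
from routes import routers  # assumed to be an iterable of APIRouter instances

app = FastAPI()

# Mount every router collected by the routes package on the application.
for router in routers:
    app.include_router(router)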
backend/old-pyproject.toml
DELETED
@@ -1,26 +0,0 @@
-[tool.poetry]
-name = "llm-leaderboard-backend"
-version = "0.1.0"
-description = "Backend for the Open LLM Leaderboard"
-authors = ["Your Name <[email protected]>"]
-
-[tool.poetry.dependencies]
-python = ">=3.12,<3.13"
-fastapi = "^0.115.6"
-huggingface-hub = "0.29.3"
-python-dotenv = "^1.0.1"
-python-multipart = "^0.0.9"
-uvicorn = {extras = ["standard"], version = "^0.27.0"}
-loguru = "^0.7.3"
-lighteval = {version = ">=0.8.0", extras = ["math"]}
-tqdm = "^4.67.1"
-asyncio = "^3.4.3"
-datasets = "^3.3.0"
-yourbench = {git = "https://github.com/huggingface/yourbench.git"}
-tiktoken = "^0.9.0"
-requests = {extras = ["socks"], version = "^2.32.3"}
-httpx-socks = "^0.10.0"
-
-[build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
backend/poetry.lock
DELETED
The diff for this file is too large to render.
See raw diff
backend/requirements.txt
DELETED
@@ -1 +0,0 @@
-
backend/routes/benchmark.py
CHANGED
@@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
 import time
-from tasks.
-from tasks.
+from tasks.create_bench_config_file import CreateBenchConfigTask
+from tasks.create_bench import CreateBenchTask
 
 router = APIRouter(tags=["benchmark"])
 
backend/routes/evaluation.py
CHANGED
@@ -1,7 +1,7 @@
 from fastapi import APIRouter, HTTPException
 from typing import Dict, Any
 import os
-from tasks.
+from tasks.evaluation_task import EvaluationTask
 from huggingface_hub import hf_hub_download
 import json
 from datetime import datetime
backend/routes/questions.py
CHANGED
@@ -37,9 +37,7 @@ async def get_benchmark_questions(session_id: str):
     if single_dataset and len(single_dataset['train']) > 0:
         # Get a random sample (up to 2) from single-shot questions
         sample_indices = random.sample(range(len(single_dataset['train'])), min(2, len(single_dataset['train'])))
-        print(f"Dataset structure: {single_dataset['train'][0].keys()}")
         for idx in sample_indices:
-            print(f"Question {idx} data: {single_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": single_dataset['train'][idx].get("question", ""),
@@ -58,9 +56,7 @@ async def get_benchmark_questions(session_id: str):
         # Get remaining questions from multi-hop questions
         remaining = 2 - len(questions)
         sample_indices = random.sample(range(len(multi_dataset['train'])), min(remaining, len(multi_dataset['train'])))
-        print(f"Multi-hop dataset structure: {multi_dataset['train'][0].keys()}")
         for idx in sample_indices:
-            print(f"Multi-hop question {idx} data: {multi_dataset['train'][idx]}")
             questions.append({
                 "id": str(idx),
                 "question": multi_dataset['train'][idx].get("question", ""),
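The removed lines in each hunk were temporary debug prints of the dataset structure. If that introspection is ever needed again, a hedged alternative (not part of this commit, and reusing the route's own `single_dataset` and `idx` variables) is to emit it through the standard logging module at debug level so it stays silent in production:

import logging

logger = logging.getLogger(__name__)

# Only emitted when the application is configured for DEBUG-level logging.
logger.debug("Dataset structure: %s", single_dataset["train"][0].keys())
logger.debug("Question %s data: %s", idx, single_dataset["train"][idx])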
backend/routes/upload.py
CHANGED
@@ -12,6 +12,29 @@ session_files = {}
 UPLOAD_ROOT = "uploaded_files"
 os.makedirs(UPLOAD_ROOT, exist_ok=True)
 
+# Initialize session files dictionary with pre-calculated documents
+precalculated_docs = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]
+
+for doc_id in precalculated_docs:
+    doc_dir = os.path.join(UPLOAD_ROOT, doc_id)
+    if os.path.exists(doc_dir):
+        doc_files_dir = os.path.join(doc_dir, "uploaded_files")
+        if os.path.exists(doc_files_dir):
+            for filename in os.listdir(doc_files_dir):
+                if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                    file_path = os.path.join(doc_files_dir, filename)
+                    session_files[doc_id] = file_path
+                    print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                    break
+        else:
+            # Search directly in the doc_dir
+            for filename in os.listdir(doc_dir):
+                if filename.endswith((".pdf", ".txt", ".html", ".md")):
+                    file_path = os.path.join(doc_dir, filename)
+                    session_files[doc_id] = file_path
+                    print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
+                    break
+
 @router.post("/upload")
 async def upload_file(file: UploadFile = File(...)):
     """
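The block added above runs once at import time and maps each pre-calculated document id to the first matching file it finds, preferring a nested "uploaded_files" folder and falling back to the document folder itself. As a minimal sketch under the same directory-layout assumption (the helper name below is illustrative, not part of the commit), the lookup can also be written as a small function, which keeps the import-time side effect short and makes the scan easy to unit-test:

import os

DOC_EXTENSIONS = (".pdf", ".txt", ".html", ".md")

def find_precalculated_file(upload_root: str, doc_id: str) -> str | None:
    """Return the first matching document file for doc_id, or None if nothing is found."""
    doc_dir = os.path.join(upload_root, doc_id)
    if not os.path.isdir(doc_dir):
        return None
    # Prefer the nested "uploaded_files" folder, then fall back to the document folder itself.
    nested = os.path.join(doc_dir, "uploaded_files")
    search_dir = nested if os.path.isdir(nested) else doc_dir
    for filename in sorted(os.listdir(search_dir)):
        if filename.endswith(DOC_EXTENSIONS):
            return os.path.join(search_dir, filename)
    return None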
backend/tasks/{createBench.py → create_bench.py}
RENAMED
File without changes
backend/tasks/{createBenchConfigFile.py → create_bench_config_file.py}
RENAMED
@@ -114,7 +114,7 @@ class CreateBenchConfigTask:
 "provider": "novita",
 "api_key": "$HF_TOKEN",
 "max_concurrent_requests": 32,
-}
+}
 ],
 
 "model_roles": {
backend/tasks/{evaluationTask.py → evaluation_task.py}
RENAMED
File without changes
backend/test_import.py
DELETED
@@ -1,5 +0,0 @@
-try:
-    import lighteval_task
-    print("lighteval_task importé avec succès!")
-except ImportError as e:
-    print(f"Erreur: {e}")
backend/tests/test_evaluation.py
DELETED
@@ -1,165 +0,0 @@
Removed a 165-line standalone script ("Script to test the evaluation task in standalone mode"). It loaded the environment with python-dotenv, required HF_TOKEN (defaulting HF_ORGANIZATION to "yourbench"), and imported EvaluationTask from the old tasks.evaluationTask module. Its run_standalone_evaluation() generated a session UUID, ran EvaluationTask(session_uid, dataset_name) with optional custom models, streamed the task's logs until completion or a timeout, then read models_comparison.json (plus detailed_results.json and raw result files) from the task's output directory and printed a ranking of successfully evaluated models by accuracy alongside any failed models. The CLI took a positional dataset_name, repeatable --model "name/model,provider" options, and --timeout (default 3600 seconds).
backend/tests/test_hf_upload.py
DELETED
@@ -1,78 +0,0 @@
Removed a 78-line script (French-language comments and messages) that verified dataset uploads to the Hugging Face Hub: it loaded HF_TOKEN and HF_ORGANIZATION (default "yourbench") from .env, logged in via huggingface_hub, built a three-row test Dataset with Dataset.from_dict, deleted any existing {org}/test_dataset_upload dataset repository, pushed the dataset privately with push_to_hub, and printed the resulting dataset URL on success or the full traceback on failure.
backend/tests/test_inference.py
DELETED
@@ -1,84 +0,0 @@
Removed an 84-line script (French-language comments and messages) that smoke-tested chat completions for a fixed list of model/provider pairs (Qwen/Qwen2.5-72B-Instruct, meta-llama/Llama-3.3-70B-Instruct, and deepseek-ai/DeepSeek-R1-Distill-Llama-70B on sambanova; Qwen/QwQ-32B on novita) through huggingface_hub.InferenceClient. Each model was asked "What is the capital of France?" under a 10-second SIGALRM timeout, and the script printed per-model success, latency, and the response or error, followed by a summary.
backend/tests/test_lighteval.py
DELETED
@@ -1,151 +0,0 @@
Removed a 151-line script (French-language comments and messages) that exercised lighteval directly against the yourbench task. It wrote a temporary custom-task file wrapping tasks.yourbench_lighteval_task.create_yourbench_task("yourbench/yourbench_a", "lighteval"), then ran lighteval endpoint inference-providers with model=Qwen/Qwen2.5-72B-Instruct,provider=novita, the "custom|yourbench|0|0" task, --custom-tasks pointing at the temporary file, --max-samples 5, --save-details, and --no-push-to-hub via subprocess. Afterwards it printed the command's stdout/stderr, loaded the most recent results JSON from the output directory, reported the per-task metrics, checked the details directory, and deleted the temporary file.
backend/tests/test_openai.py
DELETED
@@ -1,31 +0,0 @@
-import os
-from openai import OpenAI
-from dotenv import load_dotenv
-
-# Load environment variables
-load_dotenv()
-
-def test_openai_connection():
-    try:
-        # Initialize OpenAI client
-        client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
-
-        # Make a simple request
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=[
-                {"role": "user", "content": "Say 'Hello World'"}
-            ]
-        )
-
-        print("✅ OpenAI API connection successful!")
-        print(f"Response: {response.choices[0].message.content}")
-        return True
-
-    except Exception as e:
-        print("❌ OpenAI API connection failed!")
-        print(f"Error: {str(e)}")
-        return False
-
-if __name__ == "__main__":
-    test_openai_connection()
backend/tests/test_parallel_lighteval.py
DELETED
@@ -1,278 +0,0 @@
Removed a 278-line script that ran lighteval evaluations for several models in parallel with asyncio. For each (model, provider) pair in INIT_MODELS (Qwen/Qwen2.5-72B-Instruct, meta-llama/Llama-3.3-70B-Instruct, deepseek-ai/DeepSeek-R1-Distill-Llama-70B, and Qwen/QwQ-32B, all on novita) it generated a temporary custom-task file around create_yourbench_task("yourbench/yourbench_a", "lighteval"), launched lighteval endpoint inference-providers with --max-samples 5 as an async subprocess with the HF tokens in its environment, and collected the per-model results with asyncio.gather. It then wrote parallel_test_detailed_results.json and models_comparison.json (models sorted by accuracy, with accuracy_stderr and evaluation time) and printed a per-model success summary and the total runtime.
backend/tests/test_provider_parallel_support.py
DELETED
@@ -1,227 +0,0 @@
Removed a 227-line script (French-language comments and messages) that checked whether an inference provider actually handles parallel requests. It sent five fixed prompts to Qwen/QwQ-32B on novita through curl calls to the Hugging Face Inference API (Authorization: Bearer HF_TOKEN, max_new_tokens 20), once concurrently (all requests released together via an asyncio.Event) and once sequentially, timed both runs, computed a parallelism factor (sequential duration / parallel duration) and an improvement percentage, printed a verdict ranging from excellent to weak/no parallelism, and saved the detailed results to parallel_test_{provider}_{timestamp}.json.
backend/tests/test_yourbench_results.py
DELETED
@@ -1,394 +0,0 @@
Removed a 394-line script (French-language comments and messages) for checking Yourbench results and datasets on the Hugging Face Hub. The portion rendered in this view installs its dependencies on demand (python-dotenv, huggingface_hub, loguru, pandas, pyarrow), configures loguru logging to stderr and to yourbench_tests.log (10 MB rotation, one-week retention), exposes a CLI with --dataset, --org (defaulting to HF_ORGANIZATION or "yourbench"), and --verbose, and defines a YourbenchTester class: test_dataset_exists() verifies that a dataset exists on the Hub, and analyze_dataset_content() lists the repository's files, counts JSON and Parquet files, downloads sample Parquet shards with hf_hub_download, inspects their columns with pandas for question fields and document types, and accumulates the statistics into a summary dictionary. The remainder of the file is not rendered in this view.
|
110 |
-
"fichiers_json": 0,
|
111 |
-
"fichiers_parquet": 0,
|
112 |
-
"a_questions": False,
|
113 |
-
"nb_questions": 0,
|
114 |
-
"structure_parquet": {},
|
115 |
-
"types_documents": set()
|
116 |
-
}
|
117 |
-
|
118 |
-
try:
|
119 |
-
# Lister les fichiers dans le dataset
|
120 |
-
files = self.api.list_repo_files(full_dataset_name, repo_type="dataset")
|
121 |
-
stats["fichiers"] = len(files)
|
122 |
-
|
123 |
-
if self.verbose:
|
124 |
-
logger.info(f"Fichiers trouvés dans le dataset: {len(files)}")
|
125 |
-
for file in files[:10]: # Limiter à 10 fichiers pour éviter un affichage trop verbeux
|
126 |
-
logger.info(f" - {file}")
|
127 |
-
if len(files) > 10:
|
128 |
-
logger.info(f" ... et {len(files) - 10} fichiers supplémentaires")
|
129 |
-
|
130 |
-
# Vérifier la présence de fichiers questions
|
131 |
-
question_files = [f for f in files if "question" in f.lower() and f.endswith(".json")]
|
132 |
-
stats["fichiers_json"] = len([f for f in files if f.endswith(".json")])
|
133 |
-
|
134 |
-
# Vérifier les fichiers Parquet qui sont utilisés par Yourbench
|
135 |
-
parquet_files = [f for f in files if f.endswith(".parquet")]
|
136 |
-
stats["fichiers_parquet"] = len(parquet_files)
|
137 |
-
|
138 |
-
if parquet_files:
|
139 |
-
logger.info(f"Fichiers Parquet trouvés: {len(parquet_files)}")
|
140 |
-
|
141 |
-
# Analyser un échantillon de fichiers Parquet
|
142 |
-
for parquet_file in parquet_files[:3]: # Limiter à 3 fichiers pour l'analyse
|
143 |
-
category = parquet_file.split('/')[0] if '/' in parquet_file else "unknown"
|
144 |
-
|
145 |
-
logger.info(f"Analyse du fichier Parquet: {parquet_file} (catégorie: {category})")
|
146 |
-
|
147 |
-
try:
|
148 |
-
# Télécharger le fichier Parquet
|
149 |
-
temp_file = self.api.hf_hub_download(
|
150 |
-
repo_id=full_dataset_name,
|
151 |
-
filename=parquet_file,
|
152 |
-
repo_type="dataset"
|
153 |
-
)
|
154 |
-
|
155 |
-
# Lire le fichier Parquet avec pandas
|
156 |
-
df = pd.read_parquet(temp_file)
|
157 |
-
|
158 |
-
# Ajouter des statistiques
|
159 |
-
stats["structure_parquet"][category] = {
|
160 |
-
"colonnes": list(df.columns),
|
161 |
-
"nb_lignes": len(df),
|
162 |
-
"exemple": df.iloc[0].to_dict() if len(df) > 0 else {}
|
163 |
-
}
|
164 |
-
|
165 |
-
# Vérifier si ce fichier contient des questions
|
166 |
-
if any(col for col in df.columns if "question" in col.lower()):
|
167 |
-
stats["a_questions"] = True
|
168 |
-
question_col = next(col for col in df.columns if "question" in col.lower())
|
169 |
-
stats["nb_questions"] = len(df)
|
170 |
-
|
171 |
-
# Récupérer un exemple de question
|
172 |
-
if len(df) > 0 and question_col in df.columns:
|
173 |
-
logger.info(f"Exemple de question: {df[question_col].iloc[0][:100]}...")
|
174 |
-
|
175 |
-
# Identifier les types de documents si disponible
|
176 |
-
if "doc_type" in df.columns and len(df) > 0:
|
177 |
-
doc_types = df["doc_type"].unique()
|
178 |
-
stats["types_documents"].update(doc_types)
|
179 |
-
|
180 |
-
except Exception as e:
|
181 |
-
logger.warning(f"Erreur lors de l'analyse du fichier {parquet_file}: {str(e)}")
|
182 |
-
|
183 |
-
# Convertir le set en liste pour la sérialisation JSON
|
184 |
-
stats["types_documents"] = list(stats["types_documents"])
|
185 |
-
|
186 |
-
if question_files:
|
187 |
-
stats["a_questions"] = True
|
188 |
-
|
189 |
-
# Analyser un fichier de questions pour comprendre sa structure
|
190 |
-
sample_file = question_files[0]
|
191 |
-
content = self.api.hf_hub_download(
|
192 |
-
repo_id=full_dataset_name,
|
193 |
-
filename=sample_file,
|
194 |
-
repo_type="dataset"
|
195 |
-
)
|
196 |
-
|
197 |
-
with open(content, 'r') as f:
|
198 |
-
data = json.load(f)
|
199 |
-
|
200 |
-
if isinstance(data, list):
|
201 |
-
stats["nb_questions"] = len(data)
|
202 |
-
elif isinstance(data, dict) and "questions" in data:
|
203 |
-
stats["nb_questions"] = len(data["questions"])
|
204 |
-
|
205 |
-
logger.success(f"Fichiers de questions trouvés: {len(question_files)}")
|
206 |
-
logger.info(f"Exemple de fichier analysé: {sample_file}")
|
207 |
-
logger.info(f"Nombre de questions trouvées: {stats['nb_questions']}")
|
208 |
-
|
209 |
-
return True, stats
|
210 |
-
|
211 |
-
except Exception as e:
|
212 |
-
logger.error(f"Erreur lors de l'analyse du dataset {full_dataset_name}: {str(e)}")
|
213 |
-
return False, stats
|
214 |
-
|
215 |
-
def check_evaluation_results(self, dataset_name: str) -> bool:
|
216 |
-
"""Vérifie s'il existe des résultats d'évaluation pour ce dataset.
|
217 |
-
|
218 |
-
Args:
|
219 |
-
dataset_name: Nom du dataset à vérifier
|
220 |
-
|
221 |
-
Returns:
|
222 |
-
True si des résultats d'évaluation existent, False sinon
|
223 |
-
"""
|
224 |
-
logger.info(f"Recherche de résultats d'évaluation pour le dataset: {dataset_name}")
|
225 |
-
|
226 |
-
try:
|
227 |
-
# Lister tous les datasets de l'organisation
|
228 |
-
datasets = self.api.list_datasets(author=self.organization)
|
229 |
-
|
230 |
-
# Chercher les datasets d'évaluation
|
231 |
-
eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
|
232 |
-
|
233 |
-
if self.verbose:
|
234 |
-
logger.info(f"Datasets d'évaluation trouvés: {len(eval_datasets)}")
|
235 |
-
for ds in eval_datasets[:5]:
|
236 |
-
logger.info(f" - {ds.id}")
|
237 |
-
|
238 |
-
# Vérifier si le dataset spécifié est mentionné dans les évaluations
|
239 |
-
for eval_ds in eval_datasets:
|
240 |
-
try:
|
241 |
-
# Télécharger le README pour voir si le dataset est mentionné
|
242 |
-
readme_path = self.api.hf_hub_download(
|
243 |
-
repo_id=eval_ds.id,
|
244 |
-
filename="README.md",
|
245 |
-
repo_type="dataset"
|
246 |
-
)
|
247 |
-
|
248 |
-
with open(readme_path, 'r') as f:
|
249 |
-
readme_content = f.read()
|
250 |
-
|
251 |
-
if dataset_name in readme_content:
|
252 |
-
logger.success(f"Résultats d'évaluation trouvés dans: {eval_ds.id}")
|
253 |
-
return True
|
254 |
-
except:
|
255 |
-
continue
|
256 |
-
|
257 |
-
logger.warning(f"Aucun résultat d'évaluation trouvé pour le dataset: {dataset_name}")
|
258 |
-
return False
|
259 |
-
|
260 |
-
except Exception as e:
|
261 |
-
logger.error(f"Erreur lors de la recherche de résultats d'évaluation: {str(e)}")
|
262 |
-
return False
|
263 |
-
|
264 |
-
def check_model_performances(self, dataset_name: str) -> Dict[str, float]:
|
265 |
-
"""Vérifie les performances des modèles sur le dataset spécifié.
|
266 |
-
|
267 |
-
Args:
|
268 |
-
dataset_name: Nom du dataset à vérifier
|
269 |
-
|
270 |
-
Returns:
|
271 |
-
Dictionnaire des performances des modèles (model_name -> score)
|
272 |
-
"""
|
273 |
-
logger.info(f"Vérification des performances des modèles sur le dataset: {dataset_name}")
|
274 |
-
performances = {}
|
275 |
-
|
276 |
-
try:
|
277 |
-
# Cette partie est spéculative car nous ne connaissons pas la structure exacte
|
278 |
-
# des résultats. Une approche possible serait de chercher des fichiers JSON
|
279 |
-
# contenant des métriques dans les datasets d'évaluation.
|
280 |
-
|
281 |
-
# Chercher les datasets d'évaluation
|
282 |
-
datasets = self.api.list_datasets(author=self.organization)
|
283 |
-
eval_datasets = [ds for ds in datasets if ds.id.startswith(f"{self.organization}/evaluation-")]
|
284 |
-
|
285 |
-
for eval_ds in eval_datasets:
|
286 |
-
try:
|
287 |
-
files = self.api.list_repo_files(eval_ds.id, repo_type="dataset")
|
288 |
-
result_files = [f for f in files if "result" in f.lower() and f.endswith(".json")]
|
289 |
-
|
290 |
-
for result_file in result_files:
|
291 |
-
file_path = self.api.hf_hub_download(
|
292 |
-
repo_id=eval_ds.id,
|
293 |
-
filename=result_file,
|
294 |
-
repo_type="dataset"
|
295 |
-
)
|
296 |
-
|
297 |
-
with open(file_path, 'r') as f:
|
298 |
-
results = json.load(f)
|
299 |
-
|
300 |
-
# Analyse basique des résultats (à adapter selon la structure réelle)
|
301 |
-
if "model_name" in results and "metrics" in results:
|
302 |
-
model_name = results["model_name"]
|
303 |
-
metrics = results["metrics"]
|
304 |
-
|
305 |
-
# Prendre la première métrique trouvée comme score
|
306 |
-
if metrics and isinstance(metrics, dict):
|
307 |
-
first_metric = list(metrics.keys())[0]
|
308 |
-
performances[model_name] = metrics[first_metric]
|
309 |
-
except:
|
310 |
-
continue
|
311 |
-
|
312 |
-
if performances:
|
313 |
-
logger.success(f"Performances trouvées pour {len(performances)} modèles")
|
314 |
-
for model, score in performances.items():
|
315 |
-
logger.info(f" - {model}: {score}")
|
316 |
-
else:
|
317 |
-
logger.warning("Aucune performance de modèle trouvée")
|
318 |
-
|
319 |
-
return performances
|
320 |
-
|
321 |
-
except Exception as e:
|
322 |
-
logger.error(f"Erreur lors de la vérification des performances: {str(e)}")
|
323 |
-
return {}
|
324 |
-
|
325 |
-
def main():
|
326 |
-
"""Fonction principale."""
|
327 |
-
parser = configure_argument_parser()
|
328 |
-
args = parser.parse_args()
|
329 |
-
|
330 |
-
if not args.dataset:
|
331 |
-
logger.error("Veuillez spécifier un dataset avec --dataset")
|
332 |
-
parser.print_help()
|
333 |
-
return
|
334 |
-
|
335 |
-
# Créer le testeur
|
336 |
-
tester = YourbenchTester(args.org, args.verbose)
|
337 |
-
|
338 |
-
# 1. Vérifier l'existence du dataset
|
339 |
-
dataset_info = tester.test_dataset_exists(args.dataset)
|
340 |
-
|
341 |
-
if not dataset_info:
|
342 |
-
logger.error(f"Le dataset {args.org}/{args.dataset} n'existe pas ou n'est pas accessible")
|
343 |
-
return
|
344 |
-
|
345 |
-
# 2. Analyser le contenu du dataset
|
346 |
-
success, stats = tester.analyze_dataset_content(args.dataset)
|
347 |
-
|
348 |
-
if success:
|
349 |
-
logger.info("\n=== Statistiques du dataset ===")
|
350 |
-
logger.info(f"Nombre de fichiers: {stats['fichiers']}")
|
351 |
-
logger.info(f"Fichiers JSON: {stats['fichiers_json']}")
|
352 |
-
logger.info(f"Fichiers Parquet: {stats['fichiers_parquet']}")
|
353 |
-
logger.info(f"Contient des questions: {'Oui' if stats['a_questions'] else 'Non'}")
|
354 |
-
|
355 |
-
if stats['a_questions']:
|
356 |
-
logger.info(f"Nombre de questions: {stats['nb_questions']}")
|
357 |
-
|
358 |
-
if 'types_documents' in stats and stats['types_documents']:
|
359 |
-
logger.info(f"Types de documents: {', '.join(stats['types_documents'])}")
|
360 |
-
|
361 |
-
# Afficher la structure des fichiers Parquet
|
362 |
-
if 'structure_parquet' in stats and stats['structure_parquet']:
|
363 |
-
logger.info("\n=== Structure des fichiers Parquet ===")
|
364 |
-
for category, info in stats['structure_parquet'].items():
|
365 |
-
logger.info(f"\nCatégorie: {category}")
|
366 |
-
logger.info(f"Nombre de lignes: {info['nb_lignes']}")
|
367 |
-
logger.info(f"Colonnes: {', '.join(info['colonnes'])}")
|
368 |
-
|
369 |
-
if args.verbose and 'exemple' in info and info['exemple']:
|
370 |
-
logger.info("\nExemple de ligne:")
|
371 |
-
for key, value in info['exemple'].items():
|
372 |
-
# Tronquer les valeurs trop longues
|
373 |
-
if isinstance(value, str) and len(value) > 100:
|
374 |
-
value = value[:100] + "..."
|
375 |
-
logger.info(f" {key}: {value}")
|
376 |
-
|
377 |
-
# 3. Vérifier s'il existe des résultats d'évaluation
|
378 |
-
has_evaluations = tester.check_evaluation_results(args.dataset)
|
379 |
-
|
380 |
-
if has_evaluations:
|
381 |
-
# 4. Vérifier les performances des modèles
|
382 |
-
performances = tester.check_model_performances(args.dataset)
|
383 |
-
|
384 |
-
if performances:
|
385 |
-
logger.info("\n=== Classement des modèles ===")
|
386 |
-
# Trier les modèles par score (du plus élevé au plus bas)
|
387 |
-
sorted_models = sorted(performances.items(), key=lambda x: x[1], reverse=True)
|
388 |
-
for i, (model, score) in enumerate(sorted_models, 1):
|
389 |
-
logger.info(f"{i}. {model}: {score:.4f}")
|
390 |
-
|
391 |
-
logger.success("Test terminé !")
|
392 |
-
|
393 |
-
if __name__ == "__main__":
|
394 |
-
main()
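For reference, the deleted file above exposed a small argparse CLI (--dataset, --org, --verbose). A minimal sketch of the same flow, written here only as an illustration (the import no longer resolves after this commit, and "my-dataset" is a hypothetical dataset name):

# Illustrative only: mirrors main() of the removed backend/tests/test_yourbench_results.py.
# Requires HF_TOKEN in the environment; the module itself is deleted by this commit.
from tests.test_yourbench_results import YourbenchTester

tester = YourbenchTester(organization="yourbench", verbose=True)

if tester.test_dataset_exists("my-dataset"):                     # dataset reachable on the Hub?
    ok, stats = tester.analyze_dataset_content("my-dataset")     # file and question statistics
    if ok and tester.check_evaluation_results("my-dataset"):
        scores = tester.check_model_performances("my-dataset")   # model_name -> first metric found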
backend/yourbench_simple_demo.egg-info/SOURCES.txt
CHANGED
@@ -2,14 +2,6 @@ README.md
 pyproject.toml
 lighteval_task/__init__.py
 lighteval_task/lighteval_task.py
-tests/test_evaluation.py
-tests/test_hf_upload.py
-tests/test_inference.py
-tests/test_lighteval.py
-tests/test_openai.py
-tests/test_parallel_lighteval.py
-tests/test_provider_parallel_support.py
-tests/test_yourbench_results.py
 yourbench_simple_demo.egg-info/PKG-INFO
 yourbench_simple_demo.egg-info/SOURCES.txt
 yourbench_simple_demo.egg-info/dependency_links.txt
frontend/src/components/BenchmarkCreateForm.jsx
CHANGED
@@ -1,48 +1,26 @@
-import React, { useState, useRef
+import React, { useState, useRef } from "react";
 import {
   Box,
   Paper,
   Typography,
   CircularProgress,
-  Alert,
   Button,
+  Snackbar,
+  Alert,
+  Grid,
 } from "@mui/material";
-import { useLocation } from "react-router-dom";
 import CloudUploadIcon from "@mui/icons-material/CloudUpload";
 import AutoFixHighIcon from "@mui/icons-material/AutoFixHigh";
-import
+import InsertDriveFileIcon from "@mui/icons-material/InsertDriveFile";
+import DescriptionIcon from "@mui/icons-material/Description";
+import ArticleIcon from "@mui/icons-material/Article";
+import MenuBookIcon from "@mui/icons-material/MenuBook";
 import { useThemeMode } from "../hooks/useThemeMode";
 import getTheme from "../config/theme";
 import API_CONFIG from "../config/api";

 /**
- * Component
- *
- * @param {Object} props - Component props
- * @param {number} props.activeStep - Current active step (0-based index)
- * @returns {JSX.Element} Stepper component
- */
-const StepsDisplay = ({ activeStep }) => {
-  const steps = ["Login", "Upload File", "Generate"];
-
-  return (
-    <Box sx={{ width: "100%", mb: 4 }}>
-      <Stepper activeStep={activeStep} alternativeLabel>
-        {steps.map((label) => (
-          <Step key={label}>
-            <StepLabel>{label}</StepLabel>
-          </Step>
-        ))}
-      </Stepper>
-    </Box>
-  );
-};
-
-/**
- * Component for creating a new benchmark, including authentication, file upload, and generation initiation
+ * Component for creating a new benchmark, including file upload and generation initiation
  *
  * @param {Object} props - Component props
  * @param {Function} props.onStartGeneration - Callback when generation starts with sessionId
@@ -54,31 +32,36 @@ function BenchmarkCreateForm({ onStartGeneration }) {
   const [isDragging, setIsDragging] = useState(false);
   const [uploadStatus, setUploadStatus] = useState(null);
   const [isLoading, setIsLoading] = useState(false);
-  const [activeStep, setActiveStep] = useState(0);
   const [sessionId, setSessionId] = useState(null);
+  const [openSnackbar, setOpenSnackbar] = useState(false);
+  const [selectedDocument, setSelectedDocument] = useState(null);
+  const [isDefaultDocument, setIsDefaultDocument] = useState(false);
   const fileInputRef = useRef(null);
-  const location = useLocation();
-
-  // Check if we're coming back from an OAuth redirect
-  useEffect(() => {
-    // If we have code in URL parameters, it's an OAuth callback
-    const params = new URLSearchParams(window.location.search);
-    if (params.has("code")) {
-      console.log("Detected OAuth callback, cleaning URL");
-        if (storedAuth) {
-          console.log("Found auth data after redirect, refreshing UI state");
-          setActiveStep(1); // Move to next step if authenticated
-        }
-      }, 1000);
-    }
-  }, [location]);
+
+  const defaultDocuments = [
+    {
+      id: "the-bitter-lesson",
+      name: "The Bitter Lesson",
+      icon: <ArticleIcon sx={{ fontSize: 40 }} />,
+      description: "A seminal paper on AI development by Rich Sutton",
+    },
+    {
+      id: "hurricane-faq",
+      name: "Hurricane FAQ",
+      icon: <DescriptionIcon sx={{ fontSize: 40 }} />,
+      description: "Frequently asked questions about hurricanes",
+    },
+    {
+      id: "pokemon-guide",
+      name: "Pokemon Guide",
+      icon: <MenuBookIcon sx={{ fontSize: 40 }} />,
+      description: "A comprehensive guide to Pokemon",
+    },
+  ];
+
+  const handleCloseSnackbar = () => {
+    setOpenSnackbar(false);
+  };

   const handleDragOver = (e) => {
     e.preventDefault();
@@ -97,7 +80,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
     const file = e.target.files[0];
     if (!file) return;

-    //
+    // Check if it's a PDF, TXT, HTML or MD
     if (
       !file.name.endsWith(".pdf") &&
       !file.name.endsWith(".txt") &&
@@ -117,6 +100,8 @@ function BenchmarkCreateForm({ onStartGeneration }) {
   const handleFileUpload = async (file) => {
     setIsLoading(true);
     setUploadStatus(null);
+    setIsDefaultDocument(false);
+    setSelectedDocument(null);

     try {
       const formData = new FormData();
@@ -134,20 +119,22 @@ function BenchmarkCreateForm({ onStartGeneration }) {
           success: true,
           message: `File ${result.filename} uploaded successfully`,
         });
-
+        setOpenSnackbar(true);
         setSessionId(result.session_id);
-
+        setSelectedDocument({ name: file.name });
       } else {
         setUploadStatus({
           success: false,
           message: result.error || "Upload failed",
         });
+        setOpenSnackbar(true);
       }
     } catch (error) {
       setUploadStatus({
         success: false,
         message: "Server connection error",
       });
+      setOpenSnackbar(true);
     } finally {
       setIsLoading(false);
     }
@@ -163,7 +150,7 @@ function BenchmarkCreateForm({ onStartGeneration }) {
       return;
     }

-    //
+    // Check if it's a PDF, TXT, HTML or MD
     if (
       !file.name.endsWith(".pdf") &&
       !file.name.endsWith(".txt") &&
@@ -180,114 +167,175 @@ function BenchmarkCreateForm({ onStartGeneration }) {
     handleFileUpload(file);
   };

+  const handleDefaultDocClick = (doc) => {
+    setSelectedDocument(doc);
+    setSessionId(doc.id);
+    setIsDefaultDocument(true);
+  };
+
   const handleGenerateClick = () => {
     if (onStartGeneration && sessionId) {
-      onStartGeneration(sessionId);
+      onStartGeneration(sessionId, isDefaultDocument);
     }
   };

   return (
-    <
-      {
+    <Box sx={{ mt: -2 }}>
+      <Typography
+        variant="subtitle1"
+        component="div"
+        align="center"
+        sx={{ mb: 2, color: "text.secondary" }}
+      >
+        Choose a sample document
+      </Typography>
+
+      <Grid container spacing={2} sx={{ mb: 2 }}>
+        {defaultDocuments.map((doc) => (
+          <Grid item xs={12} md={4} key={doc.id}>
+            <Box
+              elevation={2}
+              sx={{
+                p: 2,
+                display: "flex",
+                flexDirection: "column",
+                borderRadius: 1.5,
+                alignItems: "center",
+                cursor: "pointer",
+                transition: "all 0.2s ease",
+                height: "100%",
+                // border: "2px solid rgba(0, 0, 0, 0.1)",
+                border:
+                  selectedDocument?.id === doc.id
+                    ? `2px solid ${theme.palette.primary.main}`
+                    : "2px solid rgba(0, 0, 0, 0.1)",
+                "&:hover": {
+                  transform: "translateY(-2px)",
+                  boxShadow: 3,
+                },
+              }}
+              onClick={() => handleDefaultDocClick(doc)}
+            >
+              <Box sx={{ color: "primary.main", mb: 1 }}>{doc.icon}</Box>
+              <Typography variant="subtitle1" component="div" gutterBottom>
+                {doc.name}
+              </Typography>
+              <Typography
+                variant="body2"
+                color="text.secondary"
+                align="center"
+                sx={{ flexGrow: 1 }}
+              >
+                {doc.description}
+              </Typography>
+            </Box>
+          </Grid>
+        ))}
+      </Grid>
+
+      <Typography
+        variant="subtitle1"
+        component="div"
+        align="center"
+        sx={{ mb: 2, color: "text.secondary" }}
+      >
+        Or upload your own ...
+      </Typography>
+
+      <Box
+        sx={{
+          p: 4,
+          mt: 2,
+          mb: 2,
+          borderRadius: 1.5,
+          border:
+            selectedDocument?.name && !isDefaultDocument
+              ? `2px solid ${theme.palette.primary.main}`
+              : isDragging
              ? `2px dashed ${theme.palette.primary.main}`
-              : "2px dashed
+              : "2px dashed rgba(0, 0, 0, 0.16)",
+          backgroundColor: isDragging ? "rgba(0, 0, 0, 0.05)" : "transparent",
+          display: "flex",
+          flexDirection: "column",
+          alignItems: "center",
-          justifyContent: "center",
-          minHeight: 200,
-        }}
+          justifyContent: "center",
+          minHeight: 180,
+          cursor: "pointer",
+          transition: "all 0.3s ease",
+        }}
+        onDragOver={handleDragOver}
+        onDragLeave={handleDragLeave}
+        onDrop={handleDrop}
+        onClick={handleClick}
       >
+        <input
+          type="file"
+          ref={fileInputRef}
+          onChange={handleFileChange}
+          accept=".pdf,.txt,.html,.md"
+          style={{ display: "none" }}
+        />
+        {selectedDocument?.name && !isDefaultDocument ? (
+          <>
+            <InsertDriveFileIcon
+              sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+            />
+            <Typography variant="h6" component="div" gutterBottom>
+              {selectedDocument.name}
+            </Typography>
+            <Typography variant="body2" color="text.secondary">
+              Click to upload a different file
+            </Typography>
+          </>
+        ) : (
+          <>
+            <CloudUploadIcon
+              sx={{ fontSize: 50, color: "primary.main", mb: 1 }}
+            />
+            <Typography variant="h6" component="div" gutterBottom>
+              Drag and drop your file here or click to browse
+            </Typography>
+            <Typography variant="body2" color="text.secondary">
+              Accepted formats: PDF, TXT, HTML, MD
+            </Typography>
+          </>
+        )}
+
+        {isLoading && (
+          <Box sx={{ mt: 2 }}>
+            <CircularProgress size={30} />
+          </Box>
+        )}
+      </Box>
+
+      <Box sx={{ display: "flex", justifyContent: "center" }}>
+        <Button
+          variant="contained"
+          color="primary"
+          onClick={handleGenerateClick}
+          startIcon={<AutoFixHighIcon />}
+          disabled={!sessionId}
+          sx={{ mt: 2 }}
+        >
+          Generate Benchmark
+        </Button>
+      </Box>
-            Ready to generate your benchmark
-          </Typography>
-          <Button
-            variant="contained"
-            color="primary"
-            onClick={handleGenerateClick}
-            sx={{ mt: 2 }}
-          >
-            Generate Benchmark
-          </Button>
-        </Paper>
-      )}
-    </>
+
+      <Snackbar
+        open={openSnackbar}
+        autoHideDuration={6000}
+        onClose={handleCloseSnackbar}
+        anchorOrigin={{ vertical: "bottom", horizontal: "right" }}
+      >
+        <Alert
+          onClose={handleCloseSnackbar}
+          severity={uploadStatus?.success ? "success" : "error"}
+          sx={{ width: "100%" }}
+        >
+          {uploadStatus?.message}
+        </Alert>
+      </Snackbar>
+    </Box>
   );
 }
frontend/src/components/BenchmarkEvaluation.jsx
CHANGED
@@ -3,15 +3,28 @@ import { Box, Typography, CircularProgress, Alert, Paper } from "@mui/material";
 import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";

+// Temps de simulation en millisecondes pour les documents précalculés
+const SIMULATION_DURATION = 20000; // 20 secondes
+
+// Intervalle de changement des messages pour les documents standards vs précalculés
+const MESSAGE_CHANGE_INTERVAL = {
+  DEFAULT: 20000, // 20 secondes pour documents standards
+  PRECALCULATED: 5000, // 5 secondes pour documents précalculés
+};
+
 // Starting messages with their timing
 const STARTING_MESSAGES = [
-  { message: "Initializing evaluation environment...", progress:
-  { message: "Starting evaluation process...", progress:
-  { message: "Evaluating models...", progress:
-  { message: "Storing evaluation results...", progress:
+  { message: "Initializing evaluation environment...", progress: 0 },
+  { message: "Starting evaluation process...", progress: 27 },
+  { message: "Evaluating models...", progress: 54 },
+  { message: "Storing evaluation results...", progress: 84 },
 ];

-const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
+const BenchmarkEvaluation = ({ sessionId, isDefaultDocument, onComplete }) => {
+  const [searchParams] = useSearchParams();
+  const isDefault =
+    isDefaultDocument ||
+    ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
   const [evaluationComplete, setEvaluationComplete] = useState(false);
   const [error, setError] = useState(null);
   const [elapsedTime, setElapsedTime] = useState(0);
@@ -21,6 +34,7 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
   const startTimeRef = useRef(null);
   const startingMessageIntervalRef = useRef(null);
   const pollingIntervalRef = useRef(null);
+  const simulationTimeoutRef = useRef(null);

   const navigate = useNavigate();

@@ -33,21 +47,26 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {

   // Add effect to handle starting messages
   useEffect(() => {
-    startingMessageIntervalRef.current = setInterval(
+    startingMessageIntervalRef.current = setInterval(
+      () => {
+        setStartingMessageIndex((prev) => {
+          if (prev < STARTING_MESSAGES.length - 1) {
+            return prev + 1;
+          }
+          return prev;
+        });
+      },
+      isDefault
+        ? MESSAGE_CHANGE_INTERVAL.PRECALCULATED
+        : MESSAGE_CHANGE_INTERVAL.DEFAULT
+    );

     return () => {
       if (startingMessageIntervalRef.current) {
         clearInterval(startingMessageIntervalRef.current);
       }
     };
-  }, []);
+  }, [isDefault]);

   // Start evaluation when component mounts
   useEffect(() => {
@@ -62,7 +81,11 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       setElapsedTime(timeElapsed);
     }, 1000);

+    if (isDefault) {
+      simulateEvaluation();
+    } else {
+      startEvaluation();
+    }

     // Clean up intervals on unmount
     return () => {
@@ -72,8 +95,25 @@ const BenchmarkEvaluation = ({ sessionId, onComplete }) => {
       if (timerIntervalRef.current) {
         clearInterval(timerIntervalRef.current);
       }
+      if (simulationTimeoutRef.current) {
+        clearTimeout(simulationTimeoutRef.current);
+      }
     };
-  }, []);
+  }, [isDefault]);
+
+  // Simulate the evaluation process for pre-calculated documents
+  const simulateEvaluation = () => {
+    // Complete after 20 seconds
+    simulationTimeoutRef.current = setTimeout(() => {
+      setEvaluationComplete(true);
+
+      if (startingMessageIntervalRef.current) {
+        clearInterval(startingMessageIntervalRef.current);
+      }
+
+      setStartingMessageIndex(STARTING_MESSAGES.length - 1); // Set to last message
+    }, SIMULATION_DURATION);
+  };

   // Format elapsed time as HH:MM:SS
   const formatElapsedTime = () => {
frontend/src/components/BenchmarkGenerator.jsx
CHANGED
@@ -6,6 +6,9 @@ import LogDisplay from "./LogDisplay";
 import { useNavigate, useSearchParams } from "react-router-dom";
 import API_CONFIG from "../config/api";

+// Temps de simulation en millisecondes pour les documents précalculés
+const SIMULATION_DURATION = 20000; // 20 secondes
+
 // Define all benchmark steps in sequence
 const BENCHMARK_STEPS = [
   "ingestion",
@@ -28,15 +31,39 @@ const STEP_LABELS = {
   lighteval: "LightEval",
 };

+// Simulated log messages for pre-calculated documents
+const SIMULATED_LOGS = [
+  "[INFO] Initializing benchmark generation...",
+  "[INFO] Generating base configuration file...",
+  "[SUCCESS] Stage completed: ingestion",
+  "[INFO] Processing document content for upload...",
+  "[SUCCESS] Stage completed: upload_ingest_to_hub",
+  "[INFO] Generating document summary...",
+  "[SUCCESS] Stage completed: summarization",
+  "[INFO] Chunking content for better analysis...",
+  "[SUCCESS] Stage completed: chunking",
+  "[INFO] Generating single-shot questions...",
+  "[SUCCESS] Stage completed: single_shot_question_generation",
+  "[INFO] Creating multi-hop questions from content...",
+  "[SUCCESS] Stage completed: multi_hop_question_generation",
+  "[INFO] Running LightEval for benchmark validation...",
+  "[SUCCESS] Stage completed: lighteval",
+  "[SUCCESS] Ingestion process completed successfully",
+];
+
 /**
  * Component to handle benchmark generation and display logs
  *
  * @param {Object} props - Component props
  * @param {string} props.sessionId - The session ID for the uploaded file
+ * @param {boolean} props.isDefaultDocument - Whether this is a pre-calculated document
  * @param {Function} props.onComplete - Function to call when generation is complete
  * @returns {JSX.Element} Benchmark generator component
  */
-const BenchmarkGenerator = ({ sessionId, onComplete }) => {
+const BenchmarkGenerator = ({ sessionId, isDefaultDocument, onComplete }) => {
+  const [searchParams] = useSearchParams();
+  const isDefault =
+    searchParams.get("isDefault") === "true" || isDefaultDocument;
   const [generating, setGenerating] = useState(false);
   const [generationComplete, setGenerationComplete] = useState(false);
   const [generationLogs, setGenerationLogs] = useState([]);
@@ -55,6 +82,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
   // Reference for starting time
   const startTimeRef = useRef(null);

+  // Simulation interval reference
+  const simulationIntervalRef = useRef(null);
+
   // Start generation on component mount
   useEffect(() => {
     // Set start time
@@ -68,7 +98,11 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       setElapsedTime(timeElapsed);
     }, 1000);

+    if (isDefault) {
+      simulateGeneration();
+    } else {
+      generateBenchmark();
+    }

     // Clean up the polling interval and timer when the component unmounts
     return () => {
@@ -78,8 +112,56 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       if (timerIntervalRef.current) {
         clearInterval(timerIntervalRef.current);
       }
+      if (simulationIntervalRef.current) {
+        clearInterval(simulationIntervalRef.current);
+      }
     };
-  }, []);
+  }, [isDefault]);
+
+  // Simulate the benchmark generation for pre-calculated documents
+  const simulateGeneration = () => {
+    setGenerating(true);
+    setGenerationLogs([]);
+    setError(null);
+    setCurrentPhase("initializing");
+    setCompletedSteps([]);
+    setActiveStep(0);
+
+    // Timing variables for simulation
+    const totalSteps = SIMULATED_LOGS.length;
+    const totalDuration = SIMULATION_DURATION; // 20 seconds
+    const intervalPerStep = totalDuration / totalSteps;
+    let currentStep = 0;
+
+    // Function to add next log message
+    const addNextLog = () => {
+      if (currentStep < SIMULATED_LOGS.length) {
+        const newLogs = [...generationLogs, SIMULATED_LOGS[currentStep]];
+        setGenerationLogs(newLogs);
+        currentStep++;
+
+        // Check if completed
+        if (currentStep >= SIMULATED_LOGS.length) {
+          // Simulation complete
+          setTimeout(() => {
+            setCurrentPhase("complete");
+            setGenerationComplete(true);
+            clearInterval(simulationIntervalRef.current);
+            if (onComplete) {
+              onComplete({
+                success: true,
+                sessionId,
+                logs: newLogs,
+              });
+            }
+          }, 1000);
+        }
+      }
+    };
+
+    // Start simulation
+    simulationIntervalRef.current = setInterval(addNextLog, intervalPerStep);
+  };

   // Determine the current phase and completed steps based on logs
   useEffect(() => {
@@ -116,6 +198,9 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
       setActiveStep(newActiveStep);
     }

+    // Skip the rest of the log processing if we're simulating
+    if (isDefault) return;
+
     // Check the latest logs to determine the current phase
     const recentLogs = generationLogs.slice(-10); // Check more logs

@@ -157,7 +242,14 @@ const BenchmarkGenerator = ({ sessionId, onComplete }) => {
     ) {
       setCurrentPhase("configuring");
     }
-  }, [
+  }, [
+    generationLogs,
+    completedSteps,
+    activeStep,
+    sessionId,
+    onComplete,
+    isDefault,
+  ]);

   const generateBenchmark = async () => {
     if (!sessionId) {
|
frontend/src/components/EvaluationDisplay.jsx
CHANGED
@@ -14,9 +14,77 @@ import {
   Card,
   CardContent,
   Link,
+  Tooltip,
 } from "@mui/material";
 import OpenInNewIcon from "@mui/icons-material/OpenInNew";
 import CheckCircleIcon from "@mui/icons-material/CheckCircle";
+
+// Styles pour les médailles
+const MEDAL_STYLES = {
+  1: {
+    color: "#B58A1B",
+    background: "linear-gradient(135deg, #FFF7E0 0%, #FFD700 100%)",
+    borderColor: "rgba(212, 160, 23, 0.35)",
+    shadowColor: "rgba(212, 160, 23, 0.8)",
+  },
+  2: {
+    color: "#667380",
+    background: "linear-gradient(135deg, #FFFFFF 0%, #D8E3ED 100%)",
+    borderColor: "rgba(124, 139, 153, 0.35)",
+    shadowColor: "rgba(124, 139, 153, 0.8)",
+  },
+  3: {
+    color: "#B85C2F",
+    background: "linear-gradient(135deg, #FDF0E9 0%, #FFBC8C 100%)",
+    borderColor: "rgba(204, 108, 61, 0.35)",
+    shadowColor: "rgba(204, 108, 61, 0.8)",
+  },
+  default: {
+    color: "text.primary",
+    background: "transparent",
+    borderColor: "transparent",
+    shadowColor: "transparent",
+  },
+};
+
+// Fonction pour obtenir le style de médaille en fonction du rang
+const getMedalStyle = (rank) => {
+  if (rank <= 3) {
+    const medalStyle = MEDAL_STYLES[rank];
+    return {
+      color: medalStyle.color,
+      fontWeight: 900,
+      fontFamily: '"Inter", -apple-system, sans-serif',
+      width: "24px",
+      height: "24px",
+      background: medalStyle.background,
+      border: "1px solid",
+      borderColor: medalStyle.borderColor,
+      borderRadius: "50%",
+      display: "flex",
+      alignItems: "center",
+      justifyContent: "center",
+      fontSize: "0.95rem",
+      lineHeight: 1,
+      padding: 0,
+      boxShadow: `1px 1px 0 ${medalStyle.shadowColor}`,
+      marginRight: "8px",
+    };
+  }
+  // Pour les rangs > 3, même dimensions mais transparent
+  return {
+    color: "text.primary",
+    fontWeight: rank <= 10 ? 600 : 400,
+    width: "24px",
+    height: "24px",
+    display: "flex",
+    alignItems: "center",
+    justifyContent: "center",
+    fontSize: "0.95rem",
+    marginRight: "8px",
+  };
+};
+
 const EvaluationDisplay = ({ sessionId }) => {
   const [results, setResults] = useState(null);
   const [loading, setLoading] = useState(true);
@@ -60,7 +128,23 @@ const EvaluationDisplay = ({ sessionId }) => {

   // Format accuracy as percentage
   const formatAccuracy = (value) => {
-    return `${(value * 100).toFixed(2)}%`;
+    return `${(value * 100).toFixed(2)}\u2009%`;
+  };
+
+  // Fonction pour obtenir une couleur en fonction du score (rouge au vert)
+  const getColorForScore = (score) => {
+    // Convertir en pourcentage (0-100)
+    const percent = score * 100;
+
+    // Calcul de la couleur: rouge (0%) à vert (100%)
+    // Rouge diminue, vert augmente
+    const red = Math.max(
+      0,
+      Math.min(255, Math.round(255 * (1 - percent / 100)))
+    );
+    const green = Math.max(0, Math.min(255, Math.round(255 * (percent / 100))));
+
+    return `rgb(${red}, ${green}, 0)`;
   };

   // Format evaluation time
@@ -125,14 +209,35 @@ const EvaluationDisplay = ({ sessionId }) => {
             boxShadow: "0 2px 4px rgba(0,0,0,0.05)",
           }}
         >
-          <Table
+          <Table
+            sx={{
+              minWidth: 650,
+              "& .MuiTableCell-root": {
+                borderRight: "1px solid rgba(224, 224, 224, 1)",
+                borderBottom: "1px solid rgba(224, 224, 224, 1)",
+                "&:last-child": {
+                  borderRight: "none",
+                },
+              },
+              "& .MuiTableRow-root:last-child .MuiTableCell-root": {
+                borderBottom: "1px solid rgba(224, 224, 224, 1)",
+              },
+            }}
+          >
             <TableHead>
-              <TableRow
+              <TableRow
+                sx={{
+                  "& .MuiTableCell-root": {
+                    fontWeight: "bold",
+                    backgroundColor: "rgba(0, 0, 0, 0.02)",
+                  },
+                }}
+              >
+                <TableCell width="80px">Rank</TableCell>
                 <TableCell>Model</TableCell>
-                <TableCell align="
-                <TableCell align="
-                <TableCell align="
+                <TableCell align="left">Accuracy</TableCell>
+                <TableCell align="left">Eval Time</TableCell>
+                <TableCell align="right">Status</TableCell>
               </TableRow>
             </TableHead>
             <TableBody>
@@ -142,35 +247,88 @@ const EvaluationDisplay = ({ sessionId }) => {
                 <TableRow
                   key={`${model.model_name}-${model.provider}`}
                   sx={{
-                    "&:
+                    "&:nth-of-type(even)": {
+                      backgroundColor: "rgba(0, 0, 0, 0.02)",
+                    },
                   }}
                 >
-                  <TableCell>
+                  <TableCell>
+                    <Box sx={{ display: "flex", alignItems: "center" }}>
+                      <Box sx={getMedalStyle(index + 1)}>{index + 1}</Box>
+                    </Box>
+                  </TableCell>
                   <TableCell component="th" scope="row">
-                    <
+                    <Tooltip title={model.model_name} placement="top">
+                      <Link
+                        href={`https://huggingface.co/${model.model_name}`}
+                        target="_blank"
+                        rel="noopener noreferrer"
+                        sx={{
+                          textDecoration: "none",
+                          "&:hover": {
+                            textDecoration: "underline",
+                          },
+                          display: "flex",
+                          alignItems: "center",
+                        }}
+                      >
+                        {model.model_name.length > 20
+                          ? `${model.model_name.substring(0, 20)}...`
+                          : model.model_name}
+                        <OpenInNewIcon sx={{ ml: 0.5, fontSize: 16 }} />
+                      </Link>
+                    </Tooltip>
+                  </TableCell>
+                  <TableCell
+                    align="left"
+                    sx={{
+                      padding: 0,
+                      position: "relative",
+                      overflow: "hidden",
+                    }}
+                  >
+                    <Box
                       sx={{
-                        "
+                        position: "absolute",
+                        width: "100%",
+                        height: "100%",
+                        left: 0,
+                        top: 0,
                         display: "flex",
                         alignItems: "center",
+                        justifyContent: "flex-start",
+                        pl: 2,
                       }}
                     >
+                      <Box
+                        sx={{
+                          position: "absolute",
+                          left: 0,
+                          top: 0,
+                          height: "100%",
+                          width: `${model.accuracy * 100}%`,
+                          backgroundColor: getColorForScore(model.accuracy),
+                          opacity: 0.2,
+                          zIndex: 0,
+                        }}
+                      />
+                      <Typography
+                        sx={{
+                          position: "relative",
+                          zIndex: 1,
+                          fontWeight: model.accuracy > 0.7 ? "bold" : "normal",
+                          py: 1.5,
+                          textAlign: "left",
+                        }}
+                      >
+                        {formatAccuracy(model.accuracy)}
+                      </Typography>
+                    </Box>
                   </TableCell>
-                  <TableCell align="
+                  <TableCell align="left">
                     {formatTime(model.evaluation_time)}
                   </TableCell>
-                  <TableCell align="
+                  <TableCell align="right">
                     <span style={{ color: "green" }}>✓ Success</span>
                   </TableCell>
                 </TableRow>
|
frontend/src/components/Intro.jsx
CHANGED
@@ -1,21 +1,28 @@
 import React from "react";
-import { Box } from "@mui/material";
+import { Box, Typography } from "@mui/material";
 import HFLogo from "./Logo/HFLogo";

 const Intro = () => (
-  <Box sx={{ textAlign: "center", mb:
+  <Box sx={{ textAlign: "center", mb: 4 }}>
     <Box
       sx={{ height: "60px", mb: 4, display: "flex", justifyContent: "center" }}
     >
       <HFLogo />
     </Box>
-    <
+    <Typography
+      variant="h4"
+      component="h1"
+      gutterBottom
+      sx={{ fontWeight: 800 }}
+    >
+      Yourbench Demo
+    </Typography>
+    <Typography variant="body1" sx={{ maxWidth: "800px", mx: "auto" }}>
       YourBench is an <b>open-source framework</b> for generating{" "}
       <b>domain-specific benchmarks</b> in a <b>zero-shot</b> manner. It aims to
       keep your large language models on their toes—even as new data sources,
       domains, and knowledge demands evolve.
-    </
+    </Typography>
   </Box>
 );
|
frontend/src/pages/BenchmarkDisplayPage.jsx
CHANGED
@@ -81,7 +81,16 @@ function BenchmarkDisplayPage() {

   const handleStartEvaluation = () => {
     console.log("Starting evaluation with session ID:", sessionId);
+    const isDefault = [
+      "the-bitter-lesson",
+      "hurricane-faq",
+      "pokemon-guide",
+    ].includes(sessionId);
+    navigate(
+      `/benchmark-evaluation?session=${sessionId}&isDefault=${
+        isDefault ? "true" : "false"
+      }`
+    );
   };

   const defaultSampleQuestions = [
|
frontend/src/pages/BenchmarkEvaluationPage.jsx
CHANGED
@@ -8,6 +8,10 @@ function BenchmarkEvaluationPage() {
   const navigate = useNavigate();
   const [searchParams] = useSearchParams();
   const sessionId = searchParams.get("session");
+  const isDefaultFromUrl = searchParams.get("isDefault") === "true";
+  const isDefault =
+    isDefaultFromUrl ||
+    ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"].includes(sessionId);
   const [isValidSession, setIsValidSession] = useState(true);
   const [isLoading, setIsLoading] = useState(true);

@@ -20,6 +24,12 @@ function BenchmarkEvaluationPage() {
       return;
     }

+    // Si c'est un document précalculé, on le considère comme valide directement
+    if (isDefault) {
+      setIsLoading(false);
+      return;
+    }
+
     const checkSession = async () => {
       try {
         const response = await fetch(
@@ -41,10 +51,11 @@ function BenchmarkEvaluationPage() {
     };

     checkSession();
-  }, [sessionId]);
+  }, [sessionId, isDefault]);

   const handleEvaluationComplete = (result) => {
     console.log("Évaluation terminée:", result);
+    // La redirection est gérée par le composant BenchmarkEvaluation
   };

   if (!isValidSession) {
@@ -69,6 +80,7 @@ function BenchmarkEvaluationPage() {
       ) : (
         <BenchmarkEvaluation
           sessionId={sessionId}
+          isDefaultDocument={isDefault}
           onComplete={handleEvaluationComplete}
         />
       )}
|
frontend/src/pages/BenchmarkGenerationPage.jsx
CHANGED
@@ -8,6 +8,7 @@ function BenchmarkGenerationPage() {
   const navigate = useNavigate();
   const [searchParams] = useSearchParams();
   const sessionId = searchParams.get("session");
+  const isDefault = searchParams.get("isDefault") === "true";
   const [isValidSession, setIsValidSession] = useState(true);

   useEffect(() => {
@@ -32,6 +33,7 @@ function BenchmarkGenerationPage() {
       <Intro />
       <BenchmarkGenerator
         sessionId={sessionId}
+        isDefaultDocument={isDefault}
         onComplete={handleGenerationComplete}
       />
     </>
|
frontend/src/pages/HomePage.jsx
CHANGED
@@ -7,8 +7,12 @@ import BenchmarkCreateForm from "../components/BenchmarkCreateForm";
 function HomePage() {
   const navigate = useNavigate();

-  const handleStartGeneration = (sid) => {
-    navigate(
+  const handleStartGeneration = (sid, isDefaultDocument) => {
+    navigate(
+      `/benchmark-generation?session=${sid}&isDefault=${
+        isDefaultDocument ? "true" : "false"
+      }`
+    );
   };

   return (