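# Compare lighteval evaluation details between the original run and the run
# that used stricter evaluation criteria in the system prompt, example by example.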
import pandas as pd

# Paths to the parquet detail files
parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
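
# NOTE: the column names used below (metrics, specifics, predictions, choices)
# are assumed to follow the lighteval details schema; if the parquet layout
# differs, the except block at the bottom prints the available columns.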

try:
    # Load the parquet files
    print("Loading data...")
    df_original = pd.read_parquet(parquet_file_original)
    df_strict = pd.read_parquet(parquet_file_strict)
    
    # Print some basic information
    print(f"Number of original examples: {len(df_original)}")
    print(f"Number of strict examples: {len(df_strict)}")
    print(f"Original scores: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
    print(f"Strict scores: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")
    
    print("\n" + "="*80 + "\n")
    print("RESULTS COMPARISON")
    print("="*80 + "\n")
    
    # Compare the results example by example
    for i in range(min(len(df_original), len(df_strict))):
        print(f"EXAMPLE {i+1}:")
        
        # Question
        question_orig = df_original.iloc[i].specifics.get('question', 'N/A')
        question_strict = df_strict.iloc[i].specifics.get('question', 'N/A')
        print(f"Question: {question_orig}")
        
        # Evaluation scores
        score_orig = df_original.iloc[i].metrics.get('accuracy', 'N/A')
        score_strict = df_strict.iloc[i].metrics.get('accuracy', 'N/A')
        print(f"Original score: {score_orig}")
        print(f"Strict score: {score_strict}")
        
        # Model answers
        model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
        model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"
        
        # Reference answers
        reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
        reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"
        
        # Check whether answers and references are identical between the two runs
        responses_identical = model_answer_orig == model_answer_strict
        references_identical = reference_orig == reference_strict
        
        print(f"Model answers identical: {'Yes' if responses_identical else 'No'}")
        print(f"References identical: {'Yes' if references_identical else 'No'}")
        
        # Point out the change that likely led to a different result
        if score_orig != score_strict:
            print("\nPossible reason for the score change:")
            print("  Stricter evaluation criteria in the system prompt")
            print("  Answers containing hedges (however, but, although, etc.) are rejected")
        
        print("-"*80 + "\n")
        
except Exception as e:
    print(f"Error: {e}")
    
    # If the dataframes were loaded, show their columns to help debug schema issues
    if "df_original" in locals():
        print("\nColumns in df_original:", df_original.columns.tolist())
    
    if "df_strict" in locals():
        print("\nColumns in df_strict:", df_strict.columns.tolist())