import pandas as pd

# Paths to the parquet files
parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
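
# Assumed schema of the lighteval "details" parquet files, inferred from the
# accesses below rather than from the lighteval documentation: each row carries
# a 'metrics' dict (with an 'accuracy' key), a 'specifics' dict (with the
# 'question'), a 'predictions' list (model outputs), and a 'choices' list
# (gold references).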
try:
    # Load the parquet files
    print("Loading data...")
    df_original = pd.read_parquet(parquet_file_original)
    df_strict = pd.read_parquet(parquet_file_strict)

    # Print basic information
    print(f"Number of original examples: {len(df_original)}")
    print(f"Number of strict examples: {len(df_strict)}")
    print(f"Original scores: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
    print(f"Strict scores: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")

    print("\n" + "=" * 80 + "\n")
    print("RESULTS COMPARISON")
    print("=" * 80 + "\n")
    # Compare the results example by example
    for i in range(min(len(df_original), len(df_strict))):
        print(f"EXAMPLE {i + 1}:")

        # Question
        question_orig = df_original.iloc[i].specifics.get('question', 'N/A')
        question_strict = df_strict.iloc[i].specifics.get('question', 'N/A')
        print(f"Question: {question_orig}")

        # Evaluation
        score_orig = df_original.iloc[i].metrics.get('accuracy', 'N/A')
        score_strict = df_strict.iloc[i].metrics.get('accuracy', 'N/A')
        print(f"Original score: {score_orig}")
        print(f"Strict score: {score_strict}")

        # Model answers
        model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
        model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"

        # References
        reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
        reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"

        # Compare the answers: identical or different
        responses_identical = model_answer_orig == model_answer_strict
        references_identical = reference_orig == reference_strict
        print(f"Model answers identical: {'Yes' if responses_identical else 'No'}")
        print(f"References identical: {'Yes' if references_identical else 'No'}")

        # Point out the change that led to a different result
        if score_orig != score_strict:
            print("\nPossible reason for the score change:")
            print("  Stricter evaluation criteria in the system prompt")
            print("  Rejection of answers containing qualifiers (however, but, although, etc.)")

        print("-" * 80 + "\n")
except Exception as e:
    print(f"Error: {e}")
    if "df_original" in locals():
        print("\nColumns in df_original:", df_original.columns.tolist())
    if "df_strict" in locals():
        print("\nColumns in df_strict:", df_strict.columns.tolist())