Ahmed Ahmed committed · Commit 1191811 · Parent: 1bac1ed

"try again"

Files changed:
- app.py +27 -118
- src/about.py +9 -25
- src/display/utils.py +2 -10
- src/evaluation/initialize_models.py +1 -3
- src/leaderboard/read_evals.py +7 -37
app.py
CHANGED

@@ -18,7 +18,6 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
-from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
 def create_results_dataframe():
 """Create and return the results DataFrame for display"""

@@ -36,17 +35,15 @@ def create_results_dataframe():
 sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
 sys.stderr.flush()
 # Return empty DataFrame with proper columns
-return pd.DataFrame(columns=["Model", "
+return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
 
 sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
 sys.stderr.flush()
 
-# Check if required columns exist
+# Check if required columns exist - only p-values matter
 required_cols = [
 AutoEvalColumn.model.name,
-"Perplexity",
 AutoEvalColumn.model_trace_p_value.name,
-AutoEvalColumn.average.name,
 AutoEvalColumn.model_type.name,
 AutoEvalColumn.precision.name,
 ]

@@ -68,10 +65,10 @@ def create_results_dataframe():
 except Exception as e:
 sys.stderr.write(f"💥 Error selecting columns: {e}\n")
 sys.stderr.flush()
-return pd.DataFrame(columns=["Model", "
+return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
 
 # Rename columns for better display
-display_df.columns = ["Model", "
+display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]
 
 sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
 sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")

@@ -84,64 +81,7 @@ def create_results_dataframe():
 sys.stderr.flush()
 return display_df
 
-
-"""Run perplexity evaluation on demand."""
-import sys
-import traceback
-import gradio as gr
-from src.evaluation.initialize_models import is_model_allowed
-
-if not model_name:
-return "Please select a model.", gr.update(), gr.update()
-
-if not is_model_allowed(model_name):
-return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
-
-try:
-# Use stderr for more reliable logging in HF Spaces
-sys.stderr.write(f"\n=== RUNNING PERPLEXITY TEST ===\n")
-sys.stderr.write(f"Model: {model_name}\n")
-sys.stderr.write(f"Revision: {revision}\n")
-sys.stderr.write(f"Precision: {precision}\n")
-sys.stderr.flush()
-
-success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
-sys.stderr.write(f"Evaluation result - Success: {success}, Result: {result}\n")
-sys.stderr.flush()
-
-if success:
-sys.stderr.write("Evaluation succeeded - updating both results tables\n")
-sys.stderr.flush()
-
-# Get updated results (this will trigger model trace p-value computation for the new model)
-sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
-sys.stderr.flush()
-
-updated_df = create_results_dataframe()
-
-sys.stderr.write("✅ Updated DataFrame created successfully\n")
-sys.stderr.flush()
-
-success_msg = f"""✅ **Perplexity evaluation completed successfully!**
-
-**Model**: {model_name}
-**Perplexity Score**: {result:.4f}
-
-🎉 **Results have been saved and both tables have been updated!**
-
-⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
-
-return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
-else:
-return f"❌ **Evaluation failed**: {result}", gr.update(), gr.update()
-
-except Exception as e:
-error_msg = str(e)
-traceback_str = traceback.format_exc()
-sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
-sys.stderr.write(f"Traceback: {traceback_str}\n")
-sys.stderr.flush()
-return f"❌ **Critical error**: {error_msg}", gr.update(), gr.update()
+# Perplexity testing removed - we only focus on p-values now
 
 # Initialize results repository and directory
 try:

@@ -173,7 +113,7 @@ except Exception as e:
 
 # Initialize allowed models
 import sys
-from src.evaluation.initialize_models import initialize_allowed_models
+from src.evaluation.initialize_models import initialize_allowed_models
 
 sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
 sys.stderr.write("📊 Initializing allowed models...\n")

@@ -205,7 +145,7 @@ with demo:
 gr.Markdown("## Model Evaluation Results")
 results_table = gr.DataFrame(
 value=RESULTS_DF,
-headers=["Model", "
+headers=["Model", "Match P-Value", "Type", "Precision"],
 interactive=False,
 wrap=False
 )

@@ -213,66 +153,35 @@ with demo:
 with gr.TabItem("📝 About", elem_id="about-tab", id=1):
 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-with gr.TabItem("
-gr.Markdown("##
-
-allowed_models = get_allowed_models()
-
-with gr.Row():
-with gr.Column():
-model_name = gr.Dropdown(
-choices=allowed_models,
-label="Model name",
-value=allowed_models[0] if allowed_models else None
-)
-revision = gr.Textbox(label="Revision", placeholder="main", value="main")
-precision = gr.Dropdown(
-choices=["float16", "bfloat16"],
-label="Precision",
-value="float16"
-)
-debug_mode = gr.Checkbox(label="Enable debug mode (more verbose logging)", value=True)
-
-with gr.Column():
-test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
-result = gr.Markdown()
-
-gr.Markdown("## Live Results")
-live_results_table = gr.DataFrame(
-value=RESULTS_DF,
-headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
-interactive=False,
-wrap=False
-)
+with gr.TabItem("🔬 Analysis", elem_id="analysis-tab", id=2):
+gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")
 
 gr.Markdown("""
-###
-- **
-- **
-- **
-- **
-
+### Current Analysis Status:
+- **P-values are computed automatically** using the model tracing pipeline
+- **Lower p-values indicate higher structural similarity** to Llama-2-7B
+- **Analysis compares neuron organization** across transformer layers
+- **Results appear in the main table** once computation is complete
+
+### Supported Models:
+- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+- `EleutherAI/llemma_7b` - LLeMa 7B
 
 ### How it works:
-1.
-2.
-3.
-4. Results
+1. Models are automatically analyzed against Llama-2-7B base
+2. Match statistic with alignment is computed
+3. P-values indicate structural similarity preservation
+4. Results appear in the main Results tab
 """)
-
-test_button.click(
-run_perplexity_test,
-[model_name, revision, precision],
-[result, live_results_table, results_table]
-)
 
 sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
-sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING
+sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
 sys.stderr.write("📊 Features enabled:\n")
-sys.stderr.write(" -
-sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
+sys.stderr.write(" - Model trace p-value computation (vs Llama-2-7B base)\n")
 sys.stderr.write(" - Match statistic with alignment\n")
-sys.stderr.write("
+sys.stderr.write(" - Structural similarity analysis\n")
+sys.stderr.write("🎉 Ready to display p-values!\n")
 sys.stderr.flush()
 
 demo.queue(default_concurrency_limit=5).launch()
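After this change, create_results_dataframe reduces the display schema to four columns and renames them for the table. A minimal, self-contained sketch of that select-and-rename pattern; the helper name and sample data are illustrative, not taken from the repo:

```python
import pandas as pd

DISPLAY_COLS = ["Model", "Match P-Value", "Type", "Precision"]

def select_display_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the p-value-centric columns and rename them for display."""
    # Internal leaderboard header; mirrors AutoEvalColumn.model_trace_p_value.name
    required_cols = ["Model", "Match P-Value ⬇️", "Type", "Precision"]
    if df is None or df.empty or any(c not in df.columns for c in required_cols):
        # Fall back to an empty frame with the display schema
        return pd.DataFrame(columns=DISPLAY_COLS)
    display_df = df[required_cols].copy()
    display_df.columns = DISPLAY_COLS
    return display_df

if __name__ == "__main__":
    sample = pd.DataFrame({
        "Model": ["lmsys/vicuna-7b-v1.5"],
        "Match P-Value ⬇️": [0.01],
        "Type": ["fine-tuned"],
        "Precision": ["float16"],
    })
    print(select_display_columns(sample))
```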
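On the UI side, the perplexity form, live results table, and click handler are gone, leaving a read-only table plus a static Analysis tab. A rough sketch of that reduced Blocks layout, assuming a recent Gradio release; the Results tab label and the placeholder DataFrame are illustrative assumptions:

```python
import gradio as gr
import pandas as pd

# Placeholder; the real app fills this from create_results_dataframe()
RESULTS_DF = pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🏅 Results", id=0):
            gr.Markdown("## Model Evaluation Results")
            gr.DataFrame(value=RESULTS_DF, interactive=False, wrap=False)
        with gr.TabItem("🔬 Analysis", id=2):
            gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically.")

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5).launch()
```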
src/about.py
CHANGED

@@ -10,10 +10,10 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-#
-
+# No tasks - we only care about p-values
+pass
 
-NUM_FEWSHOT = 0 # Not used
+NUM_FEWSHOT = 0 # Not used
 # ---------------------------------------------------
 
 # Your leaderboard name

@@ -29,8 +29,7 @@ structural similarity to Llama-2-7B using model tracing analysis.
 - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
 - `EleutherAI/llemma_7b` - LLeMa 7B
 
-**
-- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+**Metric:**
 - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
 """
 

@@ -38,18 +37,14 @@ structural similarity to Llama-2-7B using model tracing analysis.
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs
+The evaluation runs model tracing analysis on the supported language models:
 
 ### Supported Models
 - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
 - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
 - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
 
-###
-Perplexity tests using a fixed test passage about artificial intelligence.
-Perplexity measures how well a model predicts text - lower scores mean better predictions.
-
-### 2. Model Tracing Analysis
+### Model Tracing Analysis
 Compares each model's internal structure to Llama-2-7B using the "match" statistic:
 - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
 - **Comparison Models**: The 3 supported models listed above

@@ -59,29 +54,18 @@ Compares each model's internal structure to Llama-2-7B using the "match" statistic:
 
 The match statistic tests whether neurons in corresponding layers maintain similar functional roles
 between the base model and the comparison models.
-
-## Test Text
-
-The evaluation uses the following passage:
-```
-Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
-From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
-sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
-with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
-with these important social considerations.
-```
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
+## Model Analysis
 
-This leaderboard
+This leaderboard analyzes structural similarity between specific models and Llama-2-7B:
 
 1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
 2. **IBM Granite 7B Base** - IBM's foundational language model
 3. **LLeMa 7B** - EleutherAI's mathematical language model
 
-
+The p-values are computed automatically using the model tracing analysis.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
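The about text names the "match" statistic but shows none of its mechanics. As a toy illustration only, not the model-tracing pipeline itself (which runs on full 7B transformer weights), the permutation-test idea behind a match p-value looks roughly like this:

```python
import numpy as np

def match_statistic(a: np.ndarray, b: np.ndarray) -> float:
    """Fraction of rows (neurons) of `a` whose best-correlated row of `b` has the same index."""
    a_n = (a - a.mean(axis=1, keepdims=True)) / a.std(axis=1, keepdims=True)
    b_n = (b - b.mean(axis=1, keepdims=True)) / b.std(axis=1, keepdims=True)
    corr = a_n @ b_n.T / a.shape[1]  # neuron-vs-neuron correlation matrix
    return float(np.mean(corr.argmax(axis=1) == np.arange(a.shape[0])))

def match_p_value(a: np.ndarray, b: np.ndarray, n_perm: int = 500, seed: int = 0) -> float:
    """Permutation p-value: how often a shuffled neuron order matches at least as well."""
    rng = np.random.default_rng(seed)
    observed = match_statistic(a, b)
    null = [match_statistic(a, b[rng.permutation(b.shape[0])]) for _ in range(n_perm)]
    return float((np.sum(np.asarray(null) >= observed) + 1) / (n_perm + 1))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    base = rng.normal(size=(64, 128))                     # "base model" layer weights (toy)
    finetuned = base + 0.1 * rng.normal(size=base.shape)  # structure largely preserved
    print("match p-value:", match_p_value(base, finetuned))  # small value => similar structure
```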
src/display/utils.py
CHANGED

@@ -26,15 +26,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-# Use exact column name from Tasks
-task_col_name = task.value.col_name
-sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
-auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
-sys.stderr.flush()
-# Model tracing p-value column
+# Only p-value column - no other scores
 auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])

@@ -122,7 +114,7 @@ sys.stderr.write(f"COLS: {COLS}\n")
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [
+BENCHMARK_COLS = [] # No benchmark columns - only p-values
 sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
 sys.stderr.write(f"=== END COLUMN SETUP ===\n")
 sys.stderr.flush()
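The column registry above is read elsewhere as AutoEvalColumn.model_trace_p_value.name. A hedged sketch of how that works, assuming the usual leaderboard-template trick of generating the class with make_dataclass; the ColumnContent field layout shown here is an assumption, not copied from the repo:

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                    # display header, e.g. "Match P-Value ⬇️"
    type: str                    # "str", "number", "markdown", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Only p-value column - no other scores
auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])

# Turn the registry into a frozen dataclass whose class attributes hold the
# ColumnContent records, so other modules can write AutoEvalColumn.<attr>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model_trace_p_value.name)  # -> "Match P-Value ⬇️"
BENCHMARK_COLS = []  # no benchmark columns - only the p-value matters
```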
src/evaluation/initialize_models.py
CHANGED

@@ -53,9 +53,7 @@ def create_model_result_file(model_name, precision="float16"):
 "model_sha": "main"
 },
 "results": {
-
-"perplexity": None # Will be populated when user tests
-}
+# No perplexity - we only care about p-values
 }
 }
 
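Only the tail of the pre-seeded result entry is visible in this hunk. As a hedged sketch, a seeded result file after this change carries config metadata plus an empty results block; the path scheme, helper name, and every config field other than model_sha are assumptions:

```python
import json
from pathlib import Path

def write_empty_result_file(results_dir: str, model_name: str, precision: str = "float16") -> Path:
    """Write a placeholder result entry with no scores; the match p-value is added later."""
    entry = {
        "config": {
            "model_name": model_name,   # assumed field
            "model_dtype": precision,   # assumed field
            "model_sha": "main",
        },
        "results": {
            # No perplexity - we only care about p-values
        },
    }
    out_dir = Path(results_dir) / model_name.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / f"results_{precision}.json"
    path.write_text(json.dumps(entry, indent=2), encoding="utf-8")
    return path

# Example: write_empty_result_file("./eval-results", "lmsys/vicuna-7b-v1.5")
```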
src/leaderboard/read_evals.py
CHANGED

@@ -59,10 +59,8 @@ class EvalResult:
 if architectures:
 architecture = ";".join(architectures)
 
-#
+# No perplexity extraction - we only care about p-values
 results = {}
-if "perplexity" in data["results"]:
-results["perplexity"] = data["results"]["perplexity"]["perplexity"]
 
 return self(
 eval_name=result_key,

@@ -88,29 +86,9 @@ class EvalResult:
 sys.stderr.write(f"Weight type: {self.weight_type}\n")
 sys.stderr.flush()
 
-#
-
-
-sys.stderr.write(f"Available tasks: {[task.name for task in Tasks]}\n")
-
-for task in Tasks:
-sys.stderr.write(f"Looking for task: {task.value.benchmark} in results\n")
-if task.value.benchmark in self.results:
-score = self.results[task.value.benchmark]
-perplexity_score = score # Save the raw score
-sys.stderr.write(f"Found score for {task.value.benchmark}: {score}\n")
-# Convert perplexity to a 0-100 scale where lower perplexity = higher score
-# Using a log scale since perplexity can vary widely
-# Cap at 100 for very low perplexity and 0 for very high perplexity
-score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
-scores.append(score)
-sys.stderr.write(f"Converted score: {score}\n")
-else:
-sys.stderr.write(f"Task {task.value.benchmark} not found in results\n")
-sys.stderr.flush()
-
-average = sum(scores) / len(scores) if scores else 0
-sys.stderr.write(f"Calculated average score: {average}\n")
+# No task-based scoring - we only care about p-values
+average = 0 # Default average since we don't have tasks
+sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
 sys.stderr.flush()
 
 # Create data dictionary with comprehensive debugging

@@ -164,17 +142,9 @@ class EvalResult:
 sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
 sys.stderr.flush()
 
-#
-
-
-if task.value.benchmark in self.results:
-task_score = self.results[task.value.benchmark]
-data_dict[task_col_name] = task_score
-sys.stderr.write(f"Added task score: {task_col_name} = {task_score}\n")
-else:
-data_dict[task_col_name] = None
-sys.stderr.write(f"Added None for missing task: {task_col_name}\n")
-sys.stderr.flush()
+# No task-specific scores - we only have p-values
+sys.stderr.write("No task-specific scores to add\n")
+sys.stderr.flush()
 
 sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
 sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
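Net effect in read_evals.py: an eval result no longer carries task scores, only model metadata and the match p-value, with a placeholder average of 0. A simplified sketch of what the reduced row amounts to; this is an illustrative reduction, not the class's real definition:

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class EvalResultRow:
    """Illustrative reduction of EvalResult after the change: no task results."""
    eval_name: str
    full_model: str
    precision: str = "float16"
    model_type: str = "fine-tuned"
    model_trace_p_value: Optional[float] = None
    results: dict = field(default_factory=dict)  # stays empty: no perplexity, no tasks

    def to_dict(self) -> dict:
        average = 0  # default average since there is no task-based scoring
        return {
            "Model": self.full_model,
            "Match P-Value ⬇️": self.model_trace_p_value,
            "Type": self.model_type,
            "Precision": self.precision,
            "Average ⬆️": average,  # placeholder only; the Average column was dropped from the display
        }

if __name__ == "__main__":
    row = EvalResultRow("vicuna_float16", "lmsys/vicuna-7b-v1.5", model_trace_p_value=0.01)
    print(row.to_dict())
```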