Ahmed Ahmed committed · Commit 1191811 · Parent: 1bac1ed

"try again"

Files changed:
- app.py +27 -118
- src/about.py +9 -25
- src/display/utils.py +2 -10
- src/evaluation/initialize_models.py +1 -3
- src/leaderboard/read_evals.py +7 -37
app.py
CHANGED

@@ -18,7 +18,6 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
 from src.populate import get_leaderboard_df
-from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval
 
 def create_results_dataframe():
 """Create and return the results DataFrame for display"""

@@ -36,17 +35,15 @@ def create_results_dataframe():
 sys.stderr.write("⚠️ DataFrame is None or empty, returning empty DataFrame\n")
 sys.stderr.flush()
 # Return empty DataFrame with proper columns
-return pd.DataFrame(columns=["Model", "
+return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
 
 sys.stderr.write(f"📊 Original DataFrame columns: {list(df.columns)}\n")
 sys.stderr.flush()
 
-# Check if required columns exist
+# Check if required columns exist - only p-values matter
 required_cols = [
 AutoEvalColumn.model.name,
-"Perplexity",
 AutoEvalColumn.model_trace_p_value.name,
-AutoEvalColumn.average.name,
 AutoEvalColumn.model_type.name,
 AutoEvalColumn.precision.name,
 ]

@@ -68,10 +65,10 @@ def create_results_dataframe():
 except Exception as e:
 sys.stderr.write(f"💥 Error selecting columns: {e}\n")
 sys.stderr.flush()
-return pd.DataFrame(columns=["Model", "
+return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])
 
 # Rename columns for better display
-display_df.columns = ["Model", "
+display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]
 
 sys.stderr.write(f"🎯 Final display DataFrame shape: {display_df.shape}\n")
 sys.stderr.write(f"🎯 Final columns: {list(display_df.columns)}\n")

@@ -84,64 +81,7 @@ def create_results_dataframe():
 sys.stderr.flush()
 return display_df
 
-
-"""Run perplexity evaluation on demand."""
-import sys
-import traceback
-import gradio as gr
-from src.evaluation.initialize_models import is_model_allowed
-
-if not model_name:
-return "Please select a model.", gr.update(), gr.update()
-
-if not is_model_allowed(model_name):
-return f"❌ Model '{model_name}' is not in the allowed list. Please select from the dropdown.", gr.update(), gr.update()
-
-try:
-# Use stderr for more reliable logging in HF Spaces
-sys.stderr.write(f"\n=== RUNNING PERPLEXITY TEST ===\n")
-sys.stderr.write(f"Model: {model_name}\n")
-sys.stderr.write(f"Revision: {revision}\n")
-sys.stderr.write(f"Precision: {precision}\n")
-sys.stderr.flush()
-
-success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
-sys.stderr.write(f"Evaluation result - Success: {success}, Result: {result}\n")
-sys.stderr.flush()
-
-if success:
-sys.stderr.write("Evaluation succeeded - updating both results tables\n")
-sys.stderr.flush()
-
-# Get updated results (this will trigger model trace p-value computation for the new model)
-sys.stderr.write("🔄 Creating updated results DataFrame (may compute model trace p-values)...\n")
-sys.stderr.flush()
-
-updated_df = create_results_dataframe()
-
-sys.stderr.write("✅ Updated DataFrame created successfully\n")
-sys.stderr.flush()
-
-success_msg = f"""✅ **Perplexity evaluation completed successfully!**
-
-**Model**: {model_name}
-**Perplexity Score**: {result:.4f}
-
-🎉 **Results have been saved and both tables have been updated!**
-
-⏰ **Note**: Model trace p-value computation runs a full model comparison analysis and may take 10-30 minutes per model. Progress will appear in the logs."""
-
-return success_msg, gr.update(value=updated_df), gr.update(value=updated_df)
-else:
-return f"❌ **Evaluation failed**: {result}", gr.update(), gr.update()
-
-except Exception as e:
-error_msg = str(e)
-traceback_str = traceback.format_exc()
-sys.stderr.write(f"Critical error in run_perplexity_test: {error_msg}\n")
-sys.stderr.write(f"Traceback: {traceback_str}\n")
-sys.stderr.flush()
-return f"❌ **Critical error**: {error_msg}", gr.update(), gr.update()
+# Perplexity testing removed - we only focus on p-values now
 
 # Initialize results repository and directory
 try:

@@ -173,7 +113,7 @@ except Exception as e:
 
 # Initialize allowed models
 import sys
-from src.evaluation.initialize_models import initialize_allowed_models
+from src.evaluation.initialize_models import initialize_allowed_models
 
 sys.stderr.write("\n🚀 STARTING GRADIO APP INITIALIZATION\n")
 sys.stderr.write("📊 Initializing allowed models...\n")

@@ -205,7 +145,7 @@ with demo:
 gr.Markdown("## Model Evaluation Results")
 results_table = gr.DataFrame(
 value=RESULTS_DF,
-headers=["Model", "
+headers=["Model", "Match P-Value", "Type", "Precision"],
 interactive=False,
 wrap=False
 )

@@ -213,66 +153,35 @@ with demo:
 with gr.TabItem("📝 About", elem_id="about-tab", id=1):
 gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-with gr.TabItem("
-gr.Markdown("##
-
-allowed_models = get_allowed_models()
-
-with gr.Row():
-with gr.Column():
-model_name = gr.Dropdown(
-choices=allowed_models,
-label="Model name",
-value=allowed_models[0] if allowed_models else None
-)
-revision = gr.Textbox(label="Revision", placeholder="main", value="main")
-precision = gr.Dropdown(
-choices=["float16", "bfloat16"],
-label="Precision",
-value="float16"
-)
-debug_mode = gr.Checkbox(label="Enable debug mode (more verbose logging)", value=True)
-
-with gr.Column():
-test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
-result = gr.Markdown()
-
-gr.Markdown("## Live Results")
-live_results_table = gr.DataFrame(
-value=RESULTS_DF,
-headers=["Model", "Perplexity", "Match P-Value", "Average Score", "Type", "Precision"],
-interactive=False,
-wrap=False
-)
+with gr.TabItem("🔬 Analysis", elem_id="analysis-tab", id=2):
+gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")
 
 gr.Markdown("""
-###
-- **
-- **
-- **
-- **
-
+### Current Analysis Status:
+- **P-values are computed automatically** using the model tracing pipeline
+- **Lower p-values indicate higher structural similarity** to Llama-2-7B
+- **Analysis compares neuron organization** across transformer layers
+- **Results appear in the main table** once computation is complete
+
+### Supported Models:
+- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
+- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
+- `EleutherAI/llemma_7b` - LLeMa 7B
 
 ### How it works:
-1.
-2.
-3.
-4. Results
+1. Models are automatically analyzed against Llama-2-7B base
+2. Match statistic with alignment is computed
+3. P-values indicate structural similarity preservation
+4. Results appear in the main Results tab
 """)
-
-test_button.click(
-run_perplexity_test,
-[model_name, revision, precision],
-[result, live_results_table, results_table]
-)
 
 sys.stderr.write("🎯 GRADIO INTERFACE SETUP COMPLETE\n")
-sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING
+sys.stderr.write("🚀 LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
 sys.stderr.write("📊 Features enabled:\n")
-sys.stderr.write(" -
-sys.stderr.write(" - Model trace p-value computation (vs GPT-2 base)\n")
+sys.stderr.write(" - Model trace p-value computation (vs Llama-2-7B base)\n")
 sys.stderr.write(" - Match statistic with alignment\n")
-sys.stderr.write("
+sys.stderr.write(" - Structural similarity analysis\n")
+sys.stderr.write("🎉 Ready to display p-values!\n")
 sys.stderr.flush()
 
 demo.queue(default_concurrency_limit=5).launch()
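After this change, create_results_dataframe reduces the display schema to four columns and renames them for the table. A minimal, self-contained sketch of that select-and-rename pattern; the helper name and sample data are illustrative, not taken from the repo:

```python
import pandas as pd

DISPLAY_COLS = ["Model", "Match P-Value", "Type", "Precision"]

def select_display_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the p-value-centric columns and rename them for display."""
    # Internal leaderboard header; mirrors AutoEvalColumn.model_trace_p_value.name
    required_cols = ["Model", "Match P-Value ⬇️", "Type", "Precision"]
    if df is None or df.empty or any(c not in df.columns for c in required_cols):
        # Fall back to an empty frame with the display schema
        return pd.DataFrame(columns=DISPLAY_COLS)
    display_df = df[required_cols].copy()
    display_df.columns = DISPLAY_COLS
    return display_df

if __name__ == "__main__":
    sample = pd.DataFrame({
        "Model": ["lmsys/vicuna-7b-v1.5"],
        "Match P-Value ⬇️": [0.01],
        "Type": ["fine-tuned"],
        "Precision": ["float16"],
    })
    print(select_display_columns(sample))
```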
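On the UI side, the perplexity form, live results table, and click handler are gone, leaving a read-only table plus a static Analysis tab. A rough sketch of that reduced Blocks layout, assuming a recent Gradio release; the Results tab label and the placeholder DataFrame are illustrative assumptions:

```python
import gradio as gr
import pandas as pd

# Placeholder; the real app fills this from create_results_dataframe()
RESULTS_DF = pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("🏅 Results", id=0):
            gr.Markdown("## Model Evaluation Results")
            gr.DataFrame(value=RESULTS_DF, interactive=False, wrap=False)
        with gr.TabItem("🔬 Analysis", id=2):
            gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically.")

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5).launch()
```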
src/about.py
CHANGED

@@ -10,10 +10,10 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-#
-
+# No tasks - we only care about p-values
+pass
 
-NUM_FEWSHOT = 0 # Not used
+NUM_FEWSHOT = 0 # Not used
 # ---------------------------------------------------
 
 # Your leaderboard name

@@ -29,8 +29,7 @@ structural similarity to Llama-2-7B using model tracing analysis.
 - `ibm-granite/granite-7b-base` - IBM Granite 7B Base
 - `EleutherAI/llemma_7b` - LLeMa 7B
 
-**
-- **Perplexity**: Lower perplexity scores indicate better performance - it means the model is better at predicting the next token in the text.
+**Metric:**
 - **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
 """
 

@@ -38,18 +37,14 @@ structural similarity to Llama-2-7B using model tracing analysis.
 LLM_BENCHMARKS_TEXT = """
 ## How it works
 
-The evaluation runs
+The evaluation runs model tracing analysis on the supported language models:
 
 ### Supported Models
 - **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
 - **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
 - **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model
 
-###
-Perplexity tests using a fixed test passage about artificial intelligence.
-Perplexity measures how well a model predicts text - lower scores mean better predictions.
-
-### 2. Model Tracing Analysis
+### Model Tracing Analysis
 Compares each model's internal structure to Llama-2-7B using the "match" statistic:
 - **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
 - **Comparison Models**: The 3 supported models listed above

@@ -59,29 +54,18 @@ Compares each model's internal structure to Llama-2-7B using the "match" statistic:
 
 The match statistic tests whether neurons in corresponding layers maintain similar functional roles
 between the base model and the comparison models.
-
-## Test Text
-
-The evaluation uses the following passage:
-```
-Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
-From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
-sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
-with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
-with these important social considerations.
-```
 """
 
 EVALUATION_QUEUE_TEXT = """
-##
+## Model Analysis
 
-This leaderboard
+This leaderboard analyzes structural similarity between specific models and Llama-2-7B:
 
 1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
 2. **IBM Granite 7B Base** - IBM's foundational language model
 3. **LLeMa 7B** - EleutherAI's mathematical language model
 
-
+The p-values are computed automatically using the model tracing analysis.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
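The about text names the "match" statistic but shows none of its mechanics. As a toy illustration only, not the model-tracing pipeline itself (which runs on full 7B transformer weights), the permutation-test idea behind a match p-value looks roughly like this:

```python
import numpy as np

def match_statistic(a: np.ndarray, b: np.ndarray) -> float:
    """Fraction of rows (neurons) of `a` whose best-correlated row of `b` has the same index."""
    a_n = (a - a.mean(axis=1, keepdims=True)) / a.std(axis=1, keepdims=True)
    b_n = (b - b.mean(axis=1, keepdims=True)) / b.std(axis=1, keepdims=True)
    corr = a_n @ b_n.T / a.shape[1]  # neuron-vs-neuron correlation matrix
    return float(np.mean(corr.argmax(axis=1) == np.arange(a.shape[0])))

def match_p_value(a: np.ndarray, b: np.ndarray, n_perm: int = 500, seed: int = 0) -> float:
    """Permutation p-value: how often a shuffled neuron order matches at least as well."""
    rng = np.random.default_rng(seed)
    observed = match_statistic(a, b)
    null = [match_statistic(a, b[rng.permutation(b.shape[0])]) for _ in range(n_perm)]
    return float((np.sum(np.asarray(null) >= observed) + 1) / (n_perm + 1))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    base = rng.normal(size=(64, 128))                     # "base model" layer weights (toy)
    finetuned = base + 0.1 * rng.normal(size=base.shape)  # structure largely preserved
    print("match p-value:", match_p_value(base, finetuned))  # small value => similar structure
```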
src/display/utils.py
CHANGED

@@ -26,15 +26,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-# Use exact column name from Tasks
-task_col_name = task.value.col_name
-sys.stderr.write(f"Adding task column: {task.name} -> column name: {task_col_name}\n")
-auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task_col_name, "number", True)])
-sys.stderr.flush()
-# Model tracing p-value column
+# Only p-value column - no other scores
 auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])

@@ -122,7 +114,7 @@ sys.stderr.write(f"COLS: {COLS}\n")
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [
+BENCHMARK_COLS = [] # No benchmark columns - only p-values
 sys.stderr.write(f"BENCHMARK_COLS: {BENCHMARK_COLS}\n")
 sys.stderr.write(f"=== END COLUMN SETUP ===\n")
 sys.stderr.flush()
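The column registry above is read elsewhere as AutoEvalColumn.model_trace_p_value.name. A hedged sketch of how that works, assuming the usual leaderboard-template trick of generating the class with make_dataclass; the ColumnContent field layout shown here is an assumption, not copied from the repo:

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                    # display header, e.g. "Match P-Value ⬇️"
    type: str                    # "str", "number", "markdown", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Only p-value column - no other scores
auto_eval_column_dict.append(["model_trace_p_value", ColumnContent, ColumnContent("Match P-Value ⬇️", "number", True)])
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])

# Turn the registry into a frozen dataclass whose class attributes hold the
# ColumnContent records, so other modules can write AutoEvalColumn.<attr>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.model_trace_p_value.name)  # -> "Match P-Value ⬇️"
BENCHMARK_COLS = []  # no benchmark columns - only the p-value matters
```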
src/evaluation/initialize_models.py
CHANGED

@@ -53,9 +53,7 @@ def create_model_result_file(model_name, precision="float16"):
 "model_sha": "main"
 },
 "results": {
-
-"perplexity": None # Will be populated when user tests
-}
+# No perplexity - we only care about p-values
 }
 }
 
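Only the tail of the pre-seeded result entry is visible in this hunk. As a hedged sketch, a seeded result file after this change carries config metadata plus an empty results block; the path scheme, helper name, and every config field other than model_sha are assumptions:

```python
import json
from pathlib import Path

def write_empty_result_file(results_dir: str, model_name: str, precision: str = "float16") -> Path:
    """Write a placeholder result entry with no scores; the match p-value is added later."""
    entry = {
        "config": {
            "model_name": model_name,   # assumed field
            "model_dtype": precision,   # assumed field
            "model_sha": "main",
        },
        "results": {
            # No perplexity - we only care about p-values
        },
    }
    out_dir = Path(results_dir) / model_name.replace("/", "__")
    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / f"results_{precision}.json"
    path.write_text(json.dumps(entry, indent=2), encoding="utf-8")
    return path

# Example: write_empty_result_file("./eval-results", "lmsys/vicuna-7b-v1.5")
```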
src/leaderboard/read_evals.py
CHANGED

@@ -59,10 +59,8 @@ class EvalResult:
 if architectures:
 architecture = ";".join(architectures)
 
-#
+# No perplexity extraction - we only care about p-values
 results = {}
-if "perplexity" in data["results"]:
-results["perplexity"] = data["results"]["perplexity"]["perplexity"]
 
 return self(
 eval_name=result_key,

@@ -88,29 +86,9 @@ class EvalResult:
 sys.stderr.write(f"Weight type: {self.weight_type}\n")
 sys.stderr.flush()
 
-#
-
-
-sys.stderr.write(f"Available tasks: {[task.name for task in Tasks]}\n")
-
-for task in Tasks:
-sys.stderr.write(f"Looking for task: {task.value.benchmark} in results\n")
-if task.value.benchmark in self.results:
-score = self.results[task.value.benchmark]
-perplexity_score = score # Save the raw score
-sys.stderr.write(f"Found score for {task.value.benchmark}: {score}\n")
-# Convert perplexity to a 0-100 scale where lower perplexity = higher score
-# Using a log scale since perplexity can vary widely
-# Cap at 100 for very low perplexity and 0 for very high perplexity
-score = max(0, min(100, 100 * (1 - math.log(score) / 10)))
-scores.append(score)
-sys.stderr.write(f"Converted score: {score}\n")
-else:
-sys.stderr.write(f"Task {task.value.benchmark} not found in results\n")
-sys.stderr.flush()
-
-average = sum(scores) / len(scores) if scores else 0
-sys.stderr.write(f"Calculated average score: {average}\n")
+# No task-based scoring - we only care about p-values
+average = 0 # Default average since we don't have tasks
+sys.stderr.write(f"No task-based scoring, using default average: {average}\n")
 sys.stderr.flush()
 
 # Create data dictionary with comprehensive debugging

@@ -164,17 +142,9 @@ class EvalResult:
 sys.stderr.write(f"Created base data_dict with {len(data_dict)} columns\n")
 sys.stderr.flush()
 
-#
-
-
-if task.value.benchmark in self.results:
-task_score = self.results[task.value.benchmark]
-data_dict[task_col_name] = task_score
-sys.stderr.write(f"Added task score: {task_col_name} = {task_score}\n")
-else:
-data_dict[task_col_name] = None
-sys.stderr.write(f"Added None for missing task: {task_col_name}\n")
-sys.stderr.flush()
+# No task-specific scores - we only have p-values
+sys.stderr.write("No task-specific scores to add\n")
+sys.stderr.flush()
 
 sys.stderr.write(f"Final data dict has {len(data_dict)} columns: {list(data_dict.keys())}\n")
 sys.stderr.write(f"=== END PROCESSING RESULT TO_DICT ===\n")
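Net effect in read_evals.py: an eval result no longer carries task scores, only model metadata and the match p-value, with a placeholder average of 0. A simplified sketch of what the reduced row amounts to; this is an illustrative reduction, not the class's real definition:

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class EvalResultRow:
    """Illustrative reduction of EvalResult after the change: no task results."""
    eval_name: str
    full_model: str
    precision: str = "float16"
    model_type: str = "fine-tuned"
    model_trace_p_value: Optional[float] = None
    results: dict = field(default_factory=dict)  # stays empty: no perplexity, no tasks

    def to_dict(self) -> dict:
        average = 0  # default average since there is no task-based scoring
        return {
            "Model": self.full_model,
            "Match P-Value ⬇️": self.model_trace_p_value,
            "Type": self.model_type,
            "Precision": self.precision,
            "Average ⬆️": average,  # placeholder only; the Average column was dropped from the display
        }

if __name__ == "__main__":
    row = EvalResultRow("vicuna_float16", "lmsys/vicuna-7b-v1.5", model_trace_p_value=0.01)
    print(row.to_dict())
```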