H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 13

Commit

a7f824f

verified ·

1 Parent(s): 2ff25b4

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -177

app.py CHANGED Viewed

@@ -1,16 +1,9 @@
 import gradio as gr
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
-import evaluate
-import re
-import matplotlib
-matplotlib.use('Agg')
-import matplotlib.pyplot as plt
-import io
-import base64
 import os
 from huggingface_hub import login
-import spaces
 from mmlu_eval import evaluate_mmlu
 # Read token and login
@@ -21,177 +14,58 @@ else:
     print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
 # ---------------------------------------------------------------------------
-# 1. Model and tokenizer setup
 # ---------------------------------------------------------------------------
 model_name = "mistralai/Mistral-7B-Instruct-v0.3"
 tokenizer = None
 model = None
 @spaces.GPU
 def load_model():
-    global tokenizer, model
-    if tokenizer is None:
-        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-    if model is None:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            token=hf_token,
-            torch_dtype=torch.float16
-        )
-        model.to('cuda')
-    return model, tokenizer
 # ---------------------------------------------------------------------------
-# 2. Test dataset
 # ---------------------------------------------------------------------------
-test_data = [
-    {"question": "What is 2+2?", "answer": "4"},
-    {"question": "What is 3*3?", "answer": "9"},
-    {"question": "What is 10/2?", "answer": "5"},
-]
-# ---------------------------------------------------------------------------
-# 3. Load metric
-# ---------------------------------------------------------------------------
-accuracy_metric = evaluate.load("accuracy")
-# ---------------------------------------------------------------------------
-# 4. Inference helper functions
-# ---------------------------------------------------------------------------
-@spaces.GPU
-def generate_answer(question):
-    """
-    Generates an answer using Mistral's instruction format.
-    """
-    model, tokenizer = load_model()
-    # Mistral instruction format
-    prompt = f"""<s>[INST] {question}. Provide only the numerical answer. [/INST]"""
-    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=50,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id
-        )
-    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Remove the original question from the output
-    return text_output.replace(question, "").strip()
-def parse_answer(model_output):
-    """
-    Extract numeric answer from model's text output.
-    """
-    # Look for numbers (including decimals)
-    match = re.search(r"(-?\d*\.?\d+)", model_output)
-    if match:
-        return match.group(1)
-    return model_output.strip()
 # ---------------------------------------------------------------------------
-# 5. Evaluation routine
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
-def run_evaluation():
-    predictions = []
-    references = []
-    raw_outputs = []  # Store full model outputs for display
-    for sample in test_data:
-        question = sample["question"]
-        reference_answer = sample["answer"]
-        # Model inference
-        model_output = generate_answer(question)
-        predicted_answer = parse_answer(model_output)
-        predictions.append(predicted_answer)
-        references.append(reference_answer)
-        raw_outputs.append({
-            "question": question,
-            "model_output": model_output,
-            "parsed_answer": predicted_answer,
-            "reference": reference_answer
-        })
-    # Normalize answers
-    def normalize_answer(ans):
-        return str(ans).lower().strip()
-    norm_preds = [normalize_answer(p) for p in predictions]
-    norm_refs = [normalize_answer(r) for r in references]
-    # Compute accuracy
-    results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
-    accuracy = results["accuracy"]
-    # Create visualization
-    fig, ax = plt.subplots(figsize=(8, 6))
-    correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
-    incorrect_count = len(test_data) - correct_count
-    bars = ax.bar(["Correct", "Incorrect"],
-                 [correct_count, incorrect_count],
-                 color=["#2ecc71", "#e74c3c"])
-    # Add value labels on bars
-    for bar in bars:
-        height = bar.get_height()
-        ax.text(bar.get_x() + bar.get_width()/2., height,
-                f'{int(height)}',
-                ha='center', va='bottom')
-    ax.set_title("Evaluation Results")
-    ax.set_ylabel("Count")
-    ax.set_ylim([0, len(test_data) + 0.5])
-    # Convert plot to base64
-    buf = io.BytesIO()
-    plt.savefig(buf, format="png", bbox_inches='tight', dpi=300)
-    buf.seek(0)
-    plt.close(fig)
-    data = base64.b64encode(buf.read()).decode("utf-8")
-    # Create detailed results HTML
-    details_html = """
-    <div style="margin-top: 20px;">
-        <h3>Detailed Results:</h3>
-        <table style="width:100%; border-collapse: collapse;">
-            <tr style="background-color: #f5f5f5;">
-                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
-                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
-                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
-                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
-            </tr>
-    """
-    for result in raw_outputs:
-        details_html += f"""
-            <tr>
-                <td style="padding: 8px; border: 1px solid #ddd;">{result['question']}</td>
-                <td style="padding: 8px; border: 1px solid #ddd;">{result['model_output']}</td>
-                <td style="padding: 8px; border: 1px solid #ddd;">{result['parsed_answer']}</td>
-                <td style="padding: 8px; border: 1px solid #ddd;">{result['reference']}</td>
-            </tr>
-        """
-    details_html += "</table></div>"
-    full_html = f"""
-    <div>
-        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
-        {details_html}
-    </div>
-    """
-    return f"Accuracy: {accuracy:.2f}", full_html
-# ---------------------------------------------------------------------------
-# 5. MMLU Evaluation call
-# ---------------------------------------------------------------------------
 def run_mmlu_evaluation(num_questions):
     """
     Runs the MMLU evaluation with the specified number of questions per task.
     Also displays two correct and two incorrect examples.
@@ -224,32 +98,33 @@ def run_mmlu_evaluation(num_questions):
     return report
 # ---------------------------------------------------------------------------
-# 6. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Mistral-7B Math Evaluation Demo")
     gr.Markdown("""
-    This demo evaluates Mistral-7B on three very simple math problems to get started.
-    Press the button below to run the evaluation.
     """)
     eval_button = gr.Button("Run Evaluation", variant="primary")
     output_text = gr.Textbox(label="Results")
     output_plot = gr.HTML(label="Visualization and Details")
-    eval_button.click(
-        fn=run_evaluation,
-        inputs=None,
-        outputs=[output_text, output_plot]
-    )
     gr.Markdown("### MMLU Evaluation")
-    num_questions_input = gr.Number(label="Questions per Task (there are 57 total Tasks)", value=5, precision=0)
-    eval_mmlu_button = gr.Button("Run MMLU Evaluation")
     mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
-    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
 demo.launch()

+import importlib
 import os
 import gradio as gr
+import spaces  # provides the @spaces.GPU decorator used on the functions below
 import torch
 from huggingface_hub import login
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from mmlu_eval import evaluate_mmlu
+# "toy-dataset-eval" contains hyphens, so `from toy-dataset-eval import ...` is a
+# SyntaxError; load the module by its file name instead. (Renaming the file to
+# toy_dataset_eval.py and using a plain import would be the cleaner long-term fix.)
+evaluate_toy_dataset = importlib.import_module("toy-dataset-eval").evaluate_toy_dataset
 # Read token and login
     print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
 # ---------------------------------------------------------------------------
+# 1. Model and tokenizer setup and Loading
 # ---------------------------------------------------------------------------
 model_name = "mistralai/Mistral-7B-Instruct-v0.3"
 tokenizer = None
 model = None
+model_loaded = False
 @spaces.GPU
 def load_model():
+    """Loads the Mistral model and tokenizer and updates the load status."""
+    global tokenizer, model, model_loaded
+    try:
+        if tokenizer is None:
+            tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+        if model is None:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                token=hf_token,
+                torch_dtype=torch.float16
+            )
+            model.to('cuda')
+        model_loaded = True
+        return "✅ Model Loaded!"
+    except Exception as e:
+        model_loaded = False
+        return f"❌ Model Load Failed: {str(e)}"
 # ---------------------------------------------------------------------------
+# 2. Toy Evaluation
 # ---------------------------------------------------------------------------
+@spaces.GPU (duration=120)
+def run_toy_evaluation():
+    """Runs the toy dataset evaluation."""
+    if not model_loaded:
+        load_model()
+    if not model_loaded:
+        return "⚠️ Model not loaded. Please load the model first."
+    results = evaluate_toy_dataset(model, tokenizer)
+    return results  # Ensure load confirmation is shown before results
 # ---------------------------------------------------------------------------
+# 3. MMLU Evaluation call
 # ---------------------------------------------------------------------------
 @spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
 def run_mmlu_evaluation(num_questions):
+    if not model_loaded:
+        load_model()
+    if not model_loaded:
+        return "⚠️ Model not loaded. Please load the model first."
     """
     Runs the MMLU evaluation with the specified number of questions per task.
     Also displays two correct and two incorrect examples.
     return report
 # ---------------------------------------------------------------------------
+# 4. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Mistral-7B Math Evaluation Demo")
     gr.Markdown("""
+    This demo evaluates Mistral-7B on Various Datasets.
     """)
+    # Load Model Button
+    load_button = gr.Button("Load Model", variant="primary")
+    load_status = gr.Textbox(label="Model Status", interactive=False)
+    load_button.click(fn=load_model, inputs=None, outputs=load_status)
+    # Toy Dataset Evaluation
+    gr.Markdown("### Toy Dataset Evaluation")
     eval_button = gr.Button("Run Evaluation", variant="primary")
     output_text = gr.Textbox(label="Results")
     output_plot = gr.HTML(label="Visualization and Details")
+    eval_button.click(fn=run_toy_evaluation, inputs=None, outputs=[output_text, output_plot])
+    # MMLU Evaluation
     gr.Markdown("### MMLU Evaluation")
+    num_questions_input = gr.Number(label="Questions per Task (Max: 57)", value=5, precision=0)
+    eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
     mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
+    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
 demo.launch()