H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 18

Commit

0e843f9

verified ·

1 Parent(s): 1856ad2

Delete toy_dataset_eval.py

Browse files

Files changed (1) hide show

toy_dataset_eval.py +0 -151

toy_dataset_eval.py DELETED Viewed

@@ -1,151 +0,0 @@
-import torch
-import evaluate
-import re
-import base64
-import io
-import matplotlib.pyplot as plt
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import spaces  # Assuming this is a custom or predefined library for GPU handling
-# ---------------------------------------------------------------------------
-# 1. Simple Test Dataset to Run GPU Calls On
-# ---------------------------------------------------------------------------
-# Each record pairs an arithmetic question with its reference answer; the
-# answers are kept as strings, matching what parse_answer returns.
-test_data = [
-    {"question": question_text, "answer": expected_answer}
-    for question_text, expected_answer in (
-        ("What is 2+2?", "4"),
-        ("What is 3*3?", "9"),
-        ("What is 10/2?", "5"),
-    )
-]
-# ---------------------------------------------------------------------------
-# 2. Load metric
-# ---------------------------------------------------------------------------
-# Created once at module import and shared by evaluate_toy_dataset, which
-# calls accuracy_metric.compute(...) and reads the "accuracy" key of the result.
-accuracy_metric = evaluate.load("accuracy")
-# ---------------------------------------------------------------------------
-# 3. Inference helper functions
-# ---------------------------------------------------------------------------
-@spaces.GPU
-def generate_answer(question, model, tokenizer):
-    """
-    Generates an answer using Mistral's instruction format.
-
-    Args:
-        question: The question text; it is embedded in the instruction prompt.
-        model: A causal language model exposing ``generate`` and ``device``.
-        tokenizer: The tokenizer paired with ``model``.
-
-    Returns:
-        The decoded text of the newly generated tokens only (the prompt is
-        excluded), with special tokens removed and whitespace stripped.
-    """
-    # Mistral instruction format
-    prompt = f"""<s>[INST] {question}. Provide only the numerical answer. [/INST]"""
-    # Place the inputs on whatever device the model lives on instead of
-    # hard-coding 'cuda'.
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    prompt_length = inputs["input_ids"].shape[1]
-    # Some tokenizers define no pad token; fall back to EOS so that
-    # generate() always receives a concrete id.
-    pad_token_id = tokenizer.pad_token_id
-    if pad_token_id is None:
-        pad_token_id = tokenizer.eos_token_id
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=50,
-            pad_token_id=pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-    # For a causal LM the returned sequence is prompt + continuation. Slice
-    # the prompt off by token count rather than string-replacing the question:
-    # replacing would leave the instruction scaffolding in the "answer" and
-    # would also delete any echo of the question inside the real answer.
-    generated_tokens = outputs[0][prompt_length:]
-    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
-def parse_answer(model_output):
-    """
-    Pull the first number out of the model's text output.
-
-    The pattern accepts an optional leading minus sign and an optional
-    decimal point. When the text contains no digits at all, the stripped
-    text itself is returned instead.
-    """
-    number_match = re.search(r"(-?\d*\.?\d+)", model_output)
-    if number_match is None:
-        # No number found: hand back the raw text, minus surrounding whitespace.
-        return model_output.strip()
-    return number_match.group(1)
-def _normalize_answer(ans):
-    """Return a canonical string form of an answer for exact-match comparison.
-
-    The value is stringified, lower-cased and stripped. Text that parses as a
-    number is rendered canonically so that "5", "05" and "5.0" compare equal.
-    """
-    text = str(ans).lower().strip()
-    try:
-        return str(int(text))  # exact path for integers of any size
-    except ValueError:
-        pass
-    try:
-        number = float(text)
-    except ValueError:
-        return text  # not numeric: compare as plain text
-    # inf/nan report is_integer() == False and fall through to repr().
-    return str(int(number)) if number.is_integer() else repr(number)
-def _render_results_chart(correct_count, incorrect_count):
-    """Draw the correct/incorrect bar chart and return it as a base64 PNG string."""
-    fig, ax = plt.subplots(figsize=(8, 6))
-    try:
-        bars = ax.bar(["Correct", "Incorrect"],
-                      [correct_count, incorrect_count],
-                      color=["#2ecc71", "#e74c3c"])
-        # Add value labels on bars
-        for bar in bars:
-            height = bar.get_height()
-            ax.text(bar.get_x() + bar.get_width() / 2., height,
-                    f'{int(height)}',
-                    ha='center', va='bottom')
-        ax.set_title("Evaluation Results")
-        ax.set_ylabel("Count")
-        ax.set_ylim([0, correct_count + incorrect_count + 0.5])
-        buf = io.BytesIO()
-        # Save through the figure object rather than pyplot's implicit
-        # "current figure", so the right figure is always written.
-        fig.savefig(buf, format="png", bbox_inches='tight', dpi=300)
-    finally:
-        plt.close(fig)  # release the figure even if rendering fails
-    return base64.b64encode(buf.getvalue()).decode("utf-8")
-def _render_details_table(raw_outputs):
-    """Build the per-sample HTML results table from the collected records."""
-    import html  # function-scope import: html is not among the module-level imports
-    cell_style = "padding: 8px; border: 1px solid #ddd;"
-    columns = (
-        ("Question", "question"),
-        ("Model Output", "model_output"),
-        ("Parsed Answer", "parsed_answer"),
-        ("Reference", "reference"),
-    )
-    header_cells = "".join(
-        f'<th style="{cell_style}">{title}</th>' for title, _ in columns
-    )
-    body_rows = []
-    for record in raw_outputs:
-        # Model text is untrusted: escape it so it cannot inject markup.
-        cells = "".join(
-            f'<td style="{cell_style}">{html.escape(str(record[key]))}</td>'
-            for _, key in columns
-        )
-        body_rows.append(f"<tr>{cells}</tr>")
-    return (
-        '<div style="margin-top: 20px;">'
-        "<h3>Detailed Results:</h3>"
-        '<table style="width:100%; border-collapse: collapse;">'
-        f'<tr style="background-color: #f5f5f5;">{header_cells}</tr>'
-        f'{"".join(body_rows)}'
-        "</table></div>"
-    )
-@spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
-def evaluate_toy_dataset(model, tokenizer):
-    """
-    Run the model over ``test_data`` and report exact-match accuracy.
-
-    Each question is answered with ``generate_answer``, reduced to a number
-    with ``parse_answer`` and compared with the reference after normalization.
-
-    Args:
-        model: The language model passed through to ``generate_answer``.
-        tokenizer: The tokenizer passed through to ``generate_answer``.
-
-    Returns:
-        A tuple ``(summary, html)``: ``summary`` is ``"Accuracy: X.XX"`` and
-        ``html`` is a fragment holding the bar chart (inline base64 PNG)
-        followed by a table of per-sample results.
-    """
-    raw_outputs = []  # Store full model outputs for display
-    for sample in test_data:
-        question = sample["question"]
-        # Model inference
-        model_output = generate_answer(question, model, tokenizer)
-        raw_outputs.append({
-            "question": question,
-            "model_output": model_output,
-            "parsed_answer": parse_answer(model_output),
-            "reference": sample["answer"],
-        })
-    # Score each sample as a 0/1 match flag. The accuracy metric works on
-    # integer labels, so feeding it free-form answer strings fails as soon as
-    # a prediction is not an integer literal (e.g. "5.0" or unparsed text).
-    matches = [
-        int(_normalize_answer(record["parsed_answer"])
-            == _normalize_answer(record["reference"]))
-        for record in raw_outputs
-    ]
-    # Against an all-ones reference, accuracy equals the fraction of matches.
-    results = accuracy_metric.compute(predictions=matches,
-                                      references=[1] * len(matches))
-    accuracy = results["accuracy"]
-    correct_count = sum(matches)
-    incorrect_count = len(matches) - correct_count
-    chart_png = _render_results_chart(correct_count, incorrect_count)
-    full_html = (
-        "<div>"
-        f'<img src="data:image/png;base64,{chart_png}" style="width:100%; max-width:600px;">'
-        f"{_render_details_table(raw_outputs)}"
-        "</div>"
-    )
-    return f"Accuracy: {accuracy:.2f}", full_html