File size: 5,430 Bytes
8fa9808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ff25b4
8fa9808
 
2ff25b4
8fa9808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ff25b4
8fa9808
 
 
 
 
 
 
 
 
2ff25b4
8fa9808
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import base64
import html
import io
import re

import evaluate
import matplotlib.pyplot as plt
import spaces  # Assuming this is a custom or predefined library for GPU handling
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ---------------------------------------------------------------------------
# 1. Simple Test Dataset to Run GPU Calls On
# ---------------------------------------------------------------------------
# Each sample pairs a "question" with its reference "answer"; answers are kept
# as strings because they are compared against text parsed from model output.
test_data = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*3?", "answer": "9"},
    {"question": "What is 10/2?", "answer": "5"},
]

# ---------------------------------------------------------------------------
# 2. Load metric
# ---------------------------------------------------------------------------
# Loaded once at import time and reused by evaluate_toy_dataset.
accuracy_metric = evaluate.load("accuracy")

# ---------------------------------------------------------------------------
# 3. Inference helper functions
# ---------------------------------------------------------------------------
@spaces.GPU
def generate_answer(question, model, tokenizer):
    """
    Generate the model's reply to ``question`` using Mistral's instruction format.

    Args:
        question: Question text embedded in the ``[INST] ... [/INST]`` prompt.
        model: Causal language model exposing ``generate``; its ``device``
            attribute (when present) decides where the inputs are placed.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        not included), with any echo of ``question`` removed and surrounding
        whitespace stripped.
    """
    # Mistral instruction format
    prompt = f"""<s>[INST] {question}. Provide only the numerical answer. [/INST]"""

    # Follow the model's own device instead of hard-coding 'cuda'; fall back
    # to 'cuda' for model objects that do not expose a `device` attribute.
    device = getattr(model, "device", "cuda")

    # The prompt already spells out the <s> BOS marker, so ask the tokenizer
    # not to prepend a second one.
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False
    ).to(device)

    # Some tokenizers define no pad token; reuse EOS so generate() gets a
    # concrete id instead of None.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # For a causal LM the returned sequence starts with the prompt tokens;
    # decode only what was generated after them so prompt text (including the
    # "[INST] ... [/INST]" wrapper) cannot leak into the answer.
    prompt_length = inputs["input_ids"].shape[1]
    text_output = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # Drop the question if the model echoed it back verbatim.
    return text_output.replace(question, "").strip()

def parse_answer(model_output):
    """
    Pull the first number out of the model's text output.

    The first integer or decimal (optionally negative) found in
    ``model_output`` is returned as a string. When the text contains no
    number at all, the whitespace-stripped text is returned instead.
    """
    # Same pattern as before: optional sign, optional integer part,
    # optional decimal point, at least one trailing digit.
    number = re.search(r"(-?\d*\.?\d+)", model_output)
    return model_output.strip() if number is None else number.group(1)


def _normalize_answer(ans):
    """Canonicalise an answer so equal answers compare equal as strings.

    The value is stringified, lower-cased and stripped. If the result parses
    as a number it is rewritten in a canonical form ("5.0" -> "5",
    "0.50" -> "0.5"); anything else is returned as the cleaned text.
    """
    text = str(ans).lower().strip()
    try:
        value = float(text)
    except ValueError:
        return text
    # inf/nan report is_integer() == False and take the repr() branch.
    return str(int(value)) if value.is_integer() else repr(value)


def _render_accuracy_chart(correct_count, incorrect_count):
    """Draw the correct/incorrect bar chart and return it as a base64 PNG string."""
    fig, ax = plt.subplots(figsize=(8, 6))
    buf = io.BytesIO()
    try:
        bars = ax.bar(
            ["Correct", "Incorrect"],
            [correct_count, incorrect_count],
            color=["#2ecc71", "#e74c3c"],
        )

        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2.,
                height,
                f'{int(height)}',
                ha='center',
                va='bottom',
            )

        ax.set_title("Evaluation Results")
        ax.set_ylabel("Count")
        ax.set_ylim([0, correct_count + incorrect_count + 0.5])

        # Save this specific figure rather than whatever pyplot considers
        # the "current" one.
        fig.savefig(buf, format="png", bbox_inches='tight', dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _render_details_table(rows):
    """Build the per-sample HTML results table, escaping every cell value."""
    cell_keys = ("question", "model_output", "parsed_answer", "reference")
    parts = [
        """
    <div style="margin-top: 20px;">
        <h3>Detailed Results:</h3>
        <table style="width:100%; border-collapse: collapse;">
            <tr style="background-color: #f5f5f5;">
                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
            </tr>
    """
    ]
    for row in rows:
        # Model output is free text; escape it so it cannot inject markup
        # into the rendered page.
        cells = "".join(
            f"""
                <td style="padding: 8px; border: 1px solid #ddd;">{html.escape(str(row[key]))}</td>"""
            for key in cell_keys
        )
        parts.append(f"""
            <tr>{cells}
            </tr>
        """)
    parts.append("</table></div>")
    return "".join(parts)


@spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
def evaluate_toy_dataset(model, tokenizer):
    """
    Run the model over ``test_data`` and report accuracy plus an HTML summary.

    For every sample the question is sent through ``generate_answer``, the
    reply is reduced with ``parse_answer``, and the normalised prediction is
    compared with the normalised reference answer.

    Args:
        model: Model forwarded unchanged to ``generate_answer``.
        tokenizer: Tokenizer forwarded unchanged to ``generate_answer``.

    Returns:
        A tuple ``(summary, full_html)`` where ``summary`` is the string
        ``"Accuracy: <value>"`` with two decimals and ``full_html`` is an HTML
        fragment holding a base64-embedded bar chart and a per-sample table.
    """
    raw_outputs = []  # Full per-sample records for display
    for sample in test_data:
        question = sample["question"]
        reference_answer = sample["answer"]

        # Model inference
        model_output = generate_answer(question, model, tokenizer)
        raw_outputs.append({
            "question": question,
            "model_output": model_output,
            "parsed_answer": parse_answer(model_output),
            "reference": reference_answer,
        })

    norm_preds = [_normalize_answer(r["parsed_answer"]) for r in raw_outputs]
    norm_refs = [_normalize_answer(r["reference"]) for r in raw_outputs]

    # The accuracy metric is declared over integer labels, so free-text
    # predictions (e.g. "four") cannot be passed to it directly. Feed it a
    # 0/1 "matched" flag per sample against an all-ones reference instead;
    # the resulting accuracy is the fraction of exact matches.
    matches = [int(pred == ref) for pred, ref in zip(norm_preds, norm_refs)]
    if matches:
        results = accuracy_metric.compute(
            predictions=matches, references=[1] * len(matches)
        )
        accuracy = results["accuracy"]
    else:
        # Nothing to score: report zero rather than asking the metric to
        # average over an empty set.
        accuracy = 0.0

    correct_count = sum(matches)
    incorrect_count = len(matches) - correct_count

    data = _render_accuracy_chart(correct_count, incorrect_count)
    details_html = _render_details_table(raw_outputs)

    full_html = f"""
    <div>
        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
        {details_html}
    </div>
    """

    return f"Accuracy: {accuracy:.2f}", full_html