import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
import re
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io
import base64
import html
import os
from huggingface_hub import login
import spaces

# Read token and login
hf_token = os.getenv("HF_TOKEN_READ_WRITE")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")

# ---------------------------------------------------------------------------
# 1. Model and tokenizer setup
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = None
model = None

@spaces.GPU
def load_model():
    global tokenizer, model
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            torch_dtype=torch.float16
        )
        model.to('cuda')
    return model, tokenizer
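
# Rough sizing note: 7B parameters in float16 are about 14 GB of weights, so
# the model needs a single data-center-class GPU, which the spaces.GPU
# decorator requests on demand.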

# ---------------------------------------------------------------------------
# 2. Test dataset
# ---------------------------------------------------------------------------
test_data = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*3?", "answer": "9"},
    {"question": "What is 10/2?", "answer": "5"},
]
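# Three hard-coded samples keep the demo fast; a real evaluation would load a
# benchmark such as GSM8K via the `datasets` library instead.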

# ---------------------------------------------------------------------------
# 3. Load metric
# ---------------------------------------------------------------------------
# "exact_match" compares strings directly; the "accuracy" metric expects
# integer class labels and would fail on string answers such as "5.0"
exact_match_metric = evaluate.load("exact_match")

# ---------------------------------------------------------------------------
# 4. Inference helper functions
# ---------------------------------------------------------------------------
@spaces.GPU
def generate_answer(question):
    """
    Generates an answer using Mistral's instruction format.
    """
    model, tokenizer = load_model()
    
    # Mistral instruction format; the tokenizer adds the <s> (BOS) token
    # itself, so it must not be repeated in the prompt string
    prompt = f"[INST] {question} Provide only the numerical answer. [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,  # greedy decoding, deterministic
            pad_token_id=tokenizer.eos_token_id,  # Mistral has no pad token
            eos_token_id=tokenizer.eos_token_id
        )
    # Decode only the newly generated tokens instead of string-replacing the
    # question, which breaks when the model rephrases it
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
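
# An alternative to hand-writing the [INST] markers is the tokenizer's chat
# template, which follows the exact format the model was trained with. A
# sketch (apply_chat_template returns the input_ids tensor directly):
#   messages = [{"role": "user", "content": question}]
#   input_ids = tokenizer.apply_chat_template(
#       messages, add_generation_prompt=True, return_tensors="pt"
#   ).to('cuda')
#   outputs = model.generate(input_ids, max_new_tokens=50, do_sample=False)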

def parse_answer(model_output):
    """
    Extract numeric answer from model's text output.
    """
    # Look for numbers (including decimals)
    match = re.search(r"(-?\d*\.?\d+)", model_output)
    if match:
        return match.group(1)
    return model_output.strip()
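
# Illustrative behaviour (not executed at import time):
#   parse_answer("The answer is 4.")  -> "4"
#   parse_answer("2 + 2 equals 4")    -> "2"  (first number wins; a known
#                                              limitation of this parser)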

# ---------------------------------------------------------------------------
# 5. Evaluation routine
# ---------------------------------------------------------------------------
@spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
def run_evaluation():
    predictions = []
    references = []
    raw_outputs = []  # Store full model outputs for display
    
    for sample in test_data:
        question = sample["question"]
        reference_answer = sample["answer"]
        
        # Model inference
        model_output = generate_answer(question)
        predicted_answer = parse_answer(model_output)
        
        predictions.append(predicted_answer)
        references.append(reference_answer)
        raw_outputs.append({
            "question": question,
            "model_output": model_output,
            "parsed_answer": predicted_answer,
            "reference": reference_answer
        })
    
    # Normalize answers so that e.g. "5.0" and "5" compare equal
    def normalize_answer(ans):
        s = str(ans).lower().strip()
        try:
            f = float(s)
            return str(int(f)) if f == int(f) else str(f)
        except ValueError:
            return s

    norm_preds = [normalize_answer(p) for p in predictions]
    norm_refs = [normalize_answer(r) for r in references]

    # Compute exact-match accuracy over the normalized strings
    results = exact_match_metric.compute(predictions=norm_preds, references=norm_refs)
    accuracy = results["exact_match"]
    
    # Create visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
    incorrect_count = len(test_data) - correct_count
    
    bars = ax.bar(["Correct", "Incorrect"], 
                 [correct_count, incorrect_count],
                 color=["#2ecc71", "#e74c3c"])
    
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom')
    
    ax.set_title("Evaluation Results")
    ax.set_ylabel("Count")
    ax.set_ylim([0, len(test_data) + 0.5])
    
    # Convert plot to base64
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches='tight', dpi=300)
    buf.seek(0)
    plt.close(fig)
    data = base64.b64encode(buf.read()).decode("utf-8")
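    # Inlining the PNG as a base64 data URI keeps the whole report in a single
    # HTML payload, so gr.HTML never needs to reference files on disk.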
    
    # Create detailed results HTML
    details_html = """
    <div style="margin-top: 20px;">
        <h3>Detailed Results:</h3>
        <table style="width:100%; border-collapse: collapse;">
            <tr style="background-color: #f5f5f5;">
                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
            </tr>
    """
    
    for result in raw_outputs:
        # Escape all strings so raw model output cannot inject markup
        details_html += f"""
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">{html.escape(str(result['question']))}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{html.escape(str(result['model_output']))}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{html.escape(str(result['parsed_answer']))}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{html.escape(str(result['reference']))}</td>
            </tr>
        """
    
    details_html += "</table></div>"
    
    full_html = f"""
    <div>
        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
        {details_html}
    </div>
    """
    
    return f"Accuracy: {accuracy:.2f}", full_html

# ---------------------------------------------------------------------------
# 6. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Mistral-7B Math Evaluation Demo")
    gr.Markdown("""
    This demo evaluates Mistral-7B on basic math problems.
    Press the button below to run the evaluation.
    """)
    
    eval_button = gr.Button("Run Evaluation", variant="primary")
    output_text = gr.Textbox(label="Results")
    output_plot = gr.HTML(label="Visualization and Details")
    
    eval_button.click(
        fn=run_evaluation,
        inputs=None,
        outputs=[output_text, output_plot]
    )

demo.launch()
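
# To run outside of Spaces: the `spaces` package must still be installed since
# it is imported unconditionally, a CUDA GPU with roughly 15 GB of free memory
# is assumed, and the token goes in the environment (assuming this file is
# saved as app.py, the usual Spaces entry point):
#   HF_TOKEN_READ_WRITE=hf_... python app.py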