Spaces:
Sleeping
Sleeping
File size: 7,187 Bytes
c5224d3 3195f7f 4c36941 e8d7a5b c5224d3 e8d7a5b 008b5f1 3195f7f e8d7a5b 3195f7f e8d7a5b 008b5f1 e8d7a5b 008b5f1 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f e8d7a5b 3195f7f c5224d3 e8d7a5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import evaluate
import re
import matplotlib
matplotlib.use('Agg') # for non-interactive envs
import matplotlib.pyplot as plt
import io
import base64
import os
from huggingface_hub import login
# Authenticate against the Hugging Face Hub when a token is configured.
hf_token = os.environ.get("HF_TOKEN_READ_WRITE")
if not hf_token:
    # No token: continue unauthenticated and let later downloads decide.
    print("⚠️ No HF_TOKEN_READ_WRITE found in environment")
else:
    login(hf_token)
# Report at startup whether CUDA is usable, and which device index 0 is.
if not torch.cuda.is_available():
    print("❌ No GPU available")
else:
    print("✅ GPU is available")
    print("GPU Name:", torch.cuda.get_device_name(0))
# ---------------------------------------------------------------------------
# 1. Define model name and load model/tokenizer
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Label used in the status message below; actual weight placement is
# delegated to device_map="auto".
device = "cpu" if not torch.cuda.is_available() else "cuda"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
)
print(f"✅ Model loaded on {device}")
# ---------------------------------------------------------------------------
# 2. Test dataset
# ---------------------------------------------------------------------------
# Each record pairs a question with its expected answer, stored as a string.
test_data = [
    {"question": prompt_text, "answer": expected}
    for prompt_text, expected in (
        ("What is 2+2?", "4"),
        ("What is 3*3?", "9"),
        ("What is 10/2?", "5"),
    )
]
# ---------------------------------------------------------------------------
# 3. Load metric
# ---------------------------------------------------------------------------
# Loaded once at import time; run_evaluation() calls
# accuracy_metric.compute(predictions=..., references=...) and reads the
# "accuracy" key of the result.
# NOTE(review): the "accuracy" metric is documented for label values —
# confirm it accepts whatever type the caller passes in.
accuracy_metric = evaluate.load("accuracy")
# ---------------------------------------------------------------------------
# 4. Inference helper functions
# ---------------------------------------------------------------------------
def generate_answer(question):
    """
    Generates an answer using Mistral's instruction format.

    The question is wrapped in ``[INST] ... [/INST]``, decoded greedily for
    at most 50 new tokens, and only the newly generated text is returned.

    Args:
        question: The question text to send to the model.

    Returns:
        The model's continuation as a stripped string, without the prompt.
    """
    # Mistral instruction format
    prompt = f"""<s>[INST] {question} [/INST]"""
    # The prompt already carries the BOS marker, so stop the tokenizer from
    # prepending a second one.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Fall back to EOS for padding when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=False,  # greedy decoding -> deterministic output
            pad_token_id=pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Slice off the prompt tokens instead of string-replacing the question:
    # that also drops the instruction markers and keeps the answer intact
    # when the model repeats the question.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
def parse_answer(model_output: str) -> str:
    """
    Extract numeric answer from model's text output.

    The first number found (optionally negative, optionally decimal) is
    returned. Decimals are canonicalised by removing insignificant trailing
    zeros, so "5.0" and "5" produce the same string and can be compared
    against integer reference answers.

    Args:
        model_output: Raw text produced by the model.

    Returns:
        The first number in the text as a string, or the stripped text
        itself when it contains no number.
    """
    # Look for numbers (including decimals)
    match = re.search(r"(-?\d*\.?\d+)", model_output)
    if not match:
        return model_output.strip()

    number = match.group(1)
    if "." in number:
        # "5.0" -> "5", "1.50" -> "1.5"; integers such as "10" are untouched
        # because they never enter this branch.
        number = number.rstrip("0").rstrip(".")
        if number in ("", "-", "-0"):
            # Inputs like ".0" or "-0.0" collapse to nothing; that is zero.
            number = "0"
    return number
# ---------------------------------------------------------------------------
# 5. Evaluation routine
# ---------------------------------------------------------------------------
def _normalize_answer(ans):
    """Lower-case and trim an answer so comparison ignores case and padding."""
    return str(ans).lower().strip()


def _render_results_chart(correct_count, incorrect_count, total):
    """Draw the correct/incorrect bar chart and return it as a base64 PNG."""
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        bars = ax.bar(
            ["Correct", "Incorrect"],
            [correct_count, incorrect_count],
            color=["#2ecc71", "#e74c3c"],
        )
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2.0,
                height,
                f"{int(height)}",
                ha="center",
                va="bottom",
            )
        ax.set_title("Evaluation Results")
        ax.set_ylabel("Count")
        ax.set_ylim([0, total + 0.5])  # Add some padding at top

        buf = io.BytesIO()
        # Save through the figure object rather than pyplot's implicit
        # "current figure", which could be a different figure.
        fig.savefig(buf, format="png", bbox_inches="tight", dpi=300)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _render_details_table(rows):
    """Build the HTML table listing each question, output, parse and reference."""
    from html import escape

    cell_style = "padding: 8px; border: 1px solid #ddd;"
    headers = ["Question", "Model Output", "Parsed Answer", "Reference"]
    keys = ["question", "model_output", "parsed_answer", "reference"]

    header_cells = "".join(
        f'<th style="{cell_style}">{title}</th>' for title in headers
    )
    parts = [
        '<div style="margin-top: 20px;">',
        "<h3>Detailed Results:</h3>",
        '<table style="width:100%; border-collapse: collapse;">',
        f'<tr style="background-color: #f5f5f5;">{header_cells}</tr>',
    ]
    for row in rows:
        # Model output is free text; escape it so stray markup cannot break
        # or inject into the page.
        cells = "".join(
            f'<td style="{cell_style}">{escape(str(row[key]))}</td>'
            for key in keys
        )
        parts.append(f"<tr>{cells}</tr>")
    parts.append("</table></div>")
    return "\n".join(parts)


def run_evaluation():
    """
    Run the model over ``test_data`` and summarise the results.

    Each question is answered with ``generate_answer``, reduced to a short
    answer with ``parse_answer``, and compared to the reference after
    lower-casing and trimming both sides.

    Returns:
        A tuple ``(summary, html)``: ``summary`` is ``"Accuracy: X.XX"`` and
        ``html`` embeds a bar chart of correct/incorrect counts followed by
        a per-question results table.
    """
    rows = []  # One record per question, kept for the details table
    for sample in test_data:
        question = sample["question"]
        # Model inference
        model_output = generate_answer(question)
        rows.append({
            "question": question,
            "model_output": model_output,
            "parsed_answer": parse_answer(model_output),
            "reference": sample["answer"],
        })

    # 1 where the normalized prediction equals the normalized reference.
    matches = [
        int(
            _normalize_answer(row["parsed_answer"])
            == _normalize_answer(row["reference"])
        )
        for row in rows
    ]
    total = len(matches)
    correct_count = sum(matches)
    incorrect_count = total - correct_count

    # Compute accuracy. The metric is fed integer match flags against an
    # all-ones reference rather than free-form answer strings, so a
    # non-integer parse (e.g. "5.0" or plain text) cannot make it fail.
    if total:
        results = accuracy_metric.compute(
            predictions=matches,
            references=[1] * total,
        )
        accuracy = results["accuracy"]
    else:
        accuracy = 0.0

    data = _render_results_chart(correct_count, incorrect_count, total)
    details_html = _render_details_table(rows)

    # Combine plot and details
    full_html = f"""
<div>
<img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
{details_html}
</div>
"""
    return f"Accuracy: {accuracy:.2f}", full_html
# ---------------------------------------------------------------------------
# 6. Gradio Interface
# ---------------------------------------------------------------------------
# One button triggers run_evaluation(); its two return values fill the
# text box (accuracy summary) and the HTML panel (chart + details table).
with gr.Blocks() as demo:
    gr.Markdown("# Mistral-7B Math Evaluation Demo")
    gr.Markdown(
        "\n"
        "This demo evaluates Mistral-7B on basic math problems.\n"
        "Press the button below to run the evaluation.\n"
    )
    eval_button = gr.Button("Run Evaluation", variant="primary")
    output_text = gr.Textbox(label="Results")
    output_plot = gr.HTML(label="Visualization and Details")
    eval_button.click(
        fn=run_evaluation,
        inputs=None,
        outputs=[output_text, output_plot],
    )

demo.launch()