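"""
Toy evaluation demo for a Hugging Face ZeroGPU Space.

Runs a handful of arithmetic questions through a Mistral-style instruct
model, scores the parsed answers with the `evaluate` accuracy metric, and
renders the results as an HTML report (bar chart plus per-question table).
"""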
import torch
import evaluate
import re
import base64
import io
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces  # Hugging Face ZeroGPU helper; provides the @spaces.GPU decorator
# ---------------------------------------------------------------------------
# 1. Simple Test Dataset to Run GPU Calls On
# ---------------------------------------------------------------------------
test_data = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*3?", "answer": "9"},
    {"question": "What is 10/2?", "answer": "5"},
]
# ---------------------------------------------------------------------------
# 2. Load metric
# ---------------------------------------------------------------------------
accuracy_metric = evaluate.load("accuracy")
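# Note: the `accuracy` metric compares label-like values and casts its inputs
# to integers under the hood, which suits these whole-number answers; for
# free-form string answers, the `exact_match` metric would be a closer fit.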
# ---------------------------------------------------------------------------
# 3. Inference helper functions
# ---------------------------------------------------------------------------
@spaces.GPU
def generate_answer(question, model, tokenizer):
    """
    Generates an answer using Mistral's instruction format.
    """
    # Mistral instruction format; the tokenizer already prepends the BOS
    # token (<s>), so it is not repeated in the prompt text.
    prompt = f"[INST] {question}. Provide only the numerical answer. [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            # Mistral tokenizers ship without a pad token; fall back to EOS
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text contains the prompt followed by the completion;
    # keep only what comes after the closing [/INST] tag.
    return text_output.split("[/INST]")[-1].strip()
def parse_answer(model_output):
    """
    Extract the numeric answer from the model's text output.
    """
    # Look for the first number (including decimals and negatives)
    match = re.search(r"(-?\d*\.?\d+)", model_output)
    if match:
        return match.group(1)
    return model_output.strip()
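# Illustrative behavior of parse_answer (not exhaustive):
#   parse_answer("The answer is 4.")  -> "4"
#   parse_answer("-2.5 degrees")      -> "-2.5"
#   parse_answer("no digits here")    -> "no digits here"
# The regex returns the FIRST number, so an echoed question like
# "2+2 equals 4" would yield "2" -- hence the prompt stripping above.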
@spaces.GPU(duration=120)  # Allow up to 2 minutes for the full evaluation
def evaluate_toy_dataset(model, tokenizer):
    predictions = []
    references = []
    raw_outputs = []  # Store full model outputs for display

    for sample in test_data:
        question = sample["question"]
        reference_answer = sample["answer"]

        # Model inference
        model_output = generate_answer(question, model, tokenizer)
        predicted_answer = parse_answer(model_output)

        predictions.append(predicted_answer)
        references.append(reference_answer)
        raw_outputs.append({
            "question": question,
            "model_output": model_output,
            "parsed_answer": predicted_answer,
            "reference": reference_answer,
        })

    # Normalize answers before comparison
    def normalize_answer(ans):
        return str(ans).lower().strip()

    norm_preds = [normalize_answer(p) for p in predictions]
    norm_refs = [normalize_answer(r) for r in references]

    # Compute accuracy
    results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
    accuracy = results["accuracy"]

    # Create visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
    incorrect_count = len(test_data) - correct_count

    bars = ax.bar(["Correct", "Incorrect"],
                  [correct_count, incorrect_count],
                  color=["#2ecc71", "#e74c3c"])

    # Add value labels on top of the bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{int(height)}',
                ha='center', va='bottom')

    ax.set_title("Evaluation Results")
    ax.set_ylabel("Count")
    ax.set_ylim([0, len(test_data) + 0.5])

    # Convert the plot to a base64-encoded PNG for inline HTML display
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches='tight', dpi=300)
    buf.seek(0)
    plt.close(fig)
    data = base64.b64encode(buf.read()).decode("utf-8")

    # Build the detailed per-question results table
    details_html = """
    <div style="margin-top: 20px;">
        <h3>Detailed Results:</h3>
        <table style="width:100%; border-collapse: collapse;">
            <tr style="background-color: #f5f5f5;">
                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
            </tr>
    """
    for result in raw_outputs:
        details_html += f"""
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['question']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['model_output']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['parsed_answer']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['reference']}</td>
            </tr>
        """
    details_html += "</table></div>"

    full_html = f"""
    <div>
        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
        {details_html}
    </div>
    """
    return f"Accuracy: {accuracy:.2f}", full_html