# Charm_15 / evaluate.py
import torch
import argparse
import json
import os
import sys

from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast
from datasets import Dataset, DatasetDict
# Paths (adjust as needed)
MODEL_DIR = "../base_model"          # Directory with config.json and .safetensors
TOKENIZER_JSON = "../tokenizer.json"
DATASET_DIR = "../datasets/"

# Load configuration (assuming it's your earlier Mistral or generation config)
with open("../config.json", "r") as f:
    config = json.load(f)
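# Note: evaluate() below only reads "max_length" from this config (falling back
# to 512), so a minimal config.json such as {"max_length": 512} would suffice.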

def load_model():
    """Load the model and tokenizer with optimizations."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    try:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # device_map="auto" already places the weights; calling .to(device) on a
        # model dispatched this way raises an error, so it is omitted here.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_DIR,
            torch_dtype=torch.bfloat16,  # From your training
            device_map="auto",           # Auto-distribute
            low_cpu_mem_usage=True,
        )
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model/tokenizer: {e}")
        sys.exit(1)

def load_custom_dataset(version):
    """Load the Eclipse Corpuz dataset for the given version."""
    dataset_path = f"{DATASET_DIR}eclipse_corpuz_{version}.json"
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset {dataset_path} not found")
        sys.exit(1)
    try:
        with open(dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Handle flexible formats
        if isinstance(data, list):
            # List of dicts with a "text" key
            if data and isinstance(data[0], dict) and "text" in data[0]:
                dataset = Dataset.from_list(data)
            # List of plain strings
            else:
                dataset = Dataset.from_dict({"text": data})
        else:
            print(f"Error: Unsupported dataset format in {dataset_path}")
            sys.exit(1)
        return DatasetDict({"test": dataset})
    except Exception as e:
        print(f"Error loading dataset: {e}")
        sys.exit(1)
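# Illustrative inputs accepted by load_custom_dataset() (hypothetical samples):
#   [{"text": "First sample ..."}, {"text": "Second sample ..."}]  # list of dicts
#   ["First sample ...", "Second sample ..."]                      # list of strings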

def evaluate(model, tokenizer, dataset, batch_size=8):
    """Evaluate the model on the Eclipse Corpuz dataset with batching."""
    dataset = dataset["test"]
    model.eval()
    losses = []
    total_tokens = 0
    correct_tokens = 0

    # Batch processing, limited to the first 100 samples
    for i in range(0, min(len(dataset), 100), batch_size):
        batch = dataset[i:i + batch_size]
        inputs = tokenizer(
            batch["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=config.get("max_length", 512),  # From config or default
        ).to(model.device)
        labels = inputs["input_ids"].clone()
        labels[inputs["attention_mask"] == 0] = -100  # Exclude padding from the loss
        with torch.no_grad():
            outputs = model(**inputs, labels=labels)
        losses.append(outputs.loss.item())

        # Shift logits/labels for next-token prediction accuracy
        shift_logits = outputs.logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        predictions = torch.argmax(shift_logits, dim=-1)
        mask = shift_labels != -100  # Ignore padding positions
        correct_tokens += (predictions == shift_labels).masked_select(mask).sum().item()
        total_tokens += mask.sum().item()

    avg_loss = sum(losses) / len(losses) if losses else float("inf")
    perplexity = torch.exp(torch.tensor(avg_loss)).item()  # ppl = exp(mean loss)
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    return {"accuracy": accuracy, "loss": avg_loss, "perplexity": perplexity}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate Charm 15 on the Eclipse Corpuz dataset")
    parser.add_argument("--version", type=str, default="1.1", help="Dataset version (e.g., 1.1, 1.2)")
    args = parser.parse_args()

    model, tokenizer = load_model()
    dataset = load_custom_dataset(args.version)
    results = evaluate(model, tokenizer, dataset, batch_size=4)  # Lowered for memory

    print(f"Evaluation Results (Eclipse Corpuz {args.version}):")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"Loss: {results['loss']:.4f}")
    print(f"Perplexity: {results['perplexity']:.4f}")

    # Cleanup
    del model
    torch.cuda.empty_cache()
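# Example invocation (assumes ../datasets/eclipse_corpuz_1.1.json exists):
#   python evaluate.py --version 1.1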