File size: 4,490 Bytes
e75fa15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import torch
import argparse
import json
import os
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast
from datasets import Dataset, DatasetDict

# Paths (adjust as needed)
# NOTE: all paths are relative to this script's working directory — run from
# the project subdirectory, or these lookups will fail.
MODEL_DIR = "../base_model"  # Directory with config.json and .safetensors
TOKENIZER_JSON = "../tokenizer.json"
DATASET_DIR = "../datasets/"

# Load configuration (assuming it's your earlier Mistral or generation config)
# NOTE(review): this open() runs at import time — importing this module from
# elsewhere will fail if ../config.json is missing. Only "max_length" is read
# from it below (in evaluate()); other keys are carried along unused.
with open("../config.json", "r") as f:
    config = json.load(f)

def load_model():
    """Load the causal-LM model and fast tokenizer for evaluation.

    Returns:
        tuple: (model, tokenizer) — model dispatched by accelerate
        (``device_map="auto"``) in bfloat16, tokenizer with a pad token set.

    Exits the process with status 1 (after printing the error) on failure.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    try:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=TOKENIZER_JSON)
        # Causal-LM tokenizers often ship without a pad token; reuse EOS so
        # batched padding in evaluate() works.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # BUGFIX: do NOT chain .to(device) after device_map="auto".
        # accelerate has already placed (and possibly sharded/offloaded) the
        # model across devices; calling .to() on a dispatched model raises a
        # RuntimeError or silently breaks the placement.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_DIR,
            torch_dtype=torch.bfloat16,  # From your training
            device_map="auto",           # Auto-distribute
            low_cpu_mem_usage=True,
        )
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model/tokenizer: {e}")
        exit(1)

def load_custom_dataset(version):
    """Load an Eclipse Corpuz JSON dataset and wrap it as a test split.

    Args:
        version (str): Dataset version suffix, e.g. "1.1" — resolves to
            ``<DATASET_DIR>/eclipse_corpuz_<version>.json``.

    Returns:
        DatasetDict: with a single "test" split whose rows have a "text" column.

    Exits the process with status 1 (after printing the error) if the file is
    missing, unreadable, or in an unsupported format.
    """
    # BUGFIX: use os.path.join instead of raw f-string concatenation so the
    # path stays correct even if DATASET_DIR loses its trailing slash.
    dataset_path = os.path.join(DATASET_DIR, f"eclipse_corpuz_{version}.json")
    if not os.path.exists(dataset_path):
        print(f"Error: Dataset {dataset_path} not found")
        exit(1)

    try:
        with open(dataset_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Handle flexible formats
        if isinstance(data, list):
            # List of dicts carrying a "text" key -> use rows as-is.
            if data and isinstance(data[0], dict) and "text" in data[0]:
                dataset = Dataset.from_list(data)
            # List of plain strings (or empty list) -> wrap under "text".
            else:
                dataset = Dataset.from_dict({"text": data})
        else:
            print(f"Error: Unsupported dataset format in {dataset_path}")
            exit(1)

        return DatasetDict({"test": dataset})
    except Exception as e:
        print(f"Error loading dataset: {e}")
        exit(1)

def evaluate(model, tokenizer, dataset, batch_size=8):
    """Evaluate the model on the dataset's "test" split.

    Computes token-weighted cross-entropy loss, perplexity, and next-token
    prediction accuracy over (at most) the first 100 samples.

    Args:
        model: Causal LM returning ``.loss`` and ``.logits`` when given labels.
        tokenizer: Tokenizer with padding enabled (pad token must be set).
        dataset (DatasetDict): Must contain a "test" split with a "text" column.
        batch_size (int): Samples per forward pass.

    Returns:
        dict: {"accuracy": float, "loss": float, "perplexity": float}.
    """
    dataset = dataset["test"]
    model.eval()
    total_loss = 0.0
    total_tokens = 0
    correct_tokens = 0

    # Batch processing, capped at 100 samples to bound evaluation time.
    for i in range(0, min(len(dataset), 100), batch_size):
        batch = dataset[i:i + batch_size]
        inputs = tokenizer(
            batch["text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=config.get("max_length", 512)  # From config or default
        ).to(model.device)

        labels = inputs["input_ids"].clone()
        # BUGFIX: mask padding positions with -100 (the HF ignore_index) so the
        # model's loss skips them. Previously pad tokens were left in the
        # labels, so loss/perplexity included padding and varied with how much
        # padding each batch happened to need.
        labels[inputs["attention_mask"] == 0] = -100

        with torch.no_grad():
            outputs = model(**inputs, labels=labels)

            # Shift logits/labels for next-token prediction accuracy.
            shift_logits = outputs.logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            predictions = torch.argmax(shift_logits, dim=-1)

            # BUGFIX: mask by -100 rather than pad_token_id. Since the pad
            # token is aliased to EOS, the old mask also discarded genuine EOS
            # positions from the accuracy count.
            mask = shift_labels != -100
            n_tokens = mask.sum().item()
            correct_tokens += (predictions == shift_labels).masked_select(mask).sum().item()
            total_tokens += n_tokens
            # Weight each batch's mean loss by its token count so a short
            # final batch does not skew the overall average.
            total_loss += outputs.loss.item() * n_tokens

    avg_loss = total_loss / total_tokens if total_tokens > 0 else float("inf")
    perplexity = torch.exp(torch.tensor(avg_loss)).item()
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0

    return {"accuracy": accuracy, "loss": avg_loss, "perplexity": perplexity}

if __name__ == "__main__":
    # CLI entry point: choose a dataset version, run evaluation, print metrics.
    arg_parser = argparse.ArgumentParser(
        description="Evaluate Charm 15 on Eclipse Corpuz dataset"
    )
    arg_parser.add_argument(
        "--version",
        type=str,
        default="1.1",
        help="Dataset version (e.g., 1.1, 1.2)",
    )
    cli_args = arg_parser.parse_args()

    model, tokenizer = load_model()
    dataset = load_custom_dataset(cli_args.version)
    results = evaluate(model, tokenizer, dataset, batch_size=4)  # Lowered for memory

    print(f"Evaluation Results (Eclipse Corpuz {cli_args.version}):")
    for label, key in (
        ("Accuracy", "accuracy"),
        ("Loss", "loss"),
        ("Perplexity", "perplexity"),
    ):
        print(f"{label}: {results[key]:.4f}")

    # Cleanup: drop the model reference and release cached GPU memory.
    del model
    torch.cuda.empty_cache()