# File size: 3,097 Bytes
# 097a740
# 11.2_evaluate_multilabel.py
import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from torch.utils.data import default_collate
from sklearn.metrics import classification_report, multilabel_confusion_matrix
# === Paths
# Inputs: the on-disk dataset and the JSON file with the label names.
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
# The model directory doubles as the output directory for every artefact.
MODEL_DIR = Path("models/multilabel/")
OUT_DIR = MODEL_DIR
REPORT_CSV = OUT_DIR.joinpath("classification_report.csv")
REPORT_JSON = OUT_DIR.joinpath("metrics.json")
CONF_MATRIX_PNG = OUT_DIR.joinpath("confusion_matrix_multilabel.png")
# === Data collator producing float32 labels
def collate_fn(batch):
    """Collate samples with ``default_collate`` and cast ``labels`` to float.

    Args:
        batch: Sequence of per-sample mappings, as accepted by
            ``torch.utils.data.default_collate``.

    Returns:
        The collated batch. If it contains a ``"labels"`` entry, that tensor
        is converted with ``.float()`` (float32); a batch without labels is
        returned exactly as collated.
    """
    collated = default_collate(batch)
    # Unlabelled batches (plain inference) have nothing to cast; skipping the
    # cast avoids a KeyError there.
    if "labels" in collated:
        collated["labels"] = collated["labels"].float()
    return collated
# === Load top_rules
# Read the JSON file as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    top_rules = json.load(f)
# === Load model + tokenizer
print("📂 Wczytywanie modelu...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
except Exception:
    # Best-effort fallback: if the tokenizer cannot be loaded from MODEL_DIR,
    # fetch the microsoft/codebert-base tokenizer and store it in MODEL_DIR
    # for later runs. `except Exception` (instead of a bare `except:`) lets
    # KeyboardInterrupt and SystemExit propagate.
    print("⚠️ Brak tokenizera w modelu — pobieram z microsoft/codebert-base")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    tokenizer.save_pretrained(MODEL_DIR)
# === Load the data and build the Trainer
dataset = load_from_disk(str(DATASET_DIR))
# Only `predict` is called on this Trainer; batches go through collate_fn.
trainer = Trainer(model=model, data_collator=collate_fn)

# === Prediction
print("🔍 Predykcja na zbiorze testowym...")
test_split = dataset["test"].with_format("torch")
predictions = trainer.predict(test_split)
# Raw model outputs -> per-label probabilities via an element-wise sigmoid.
raw_scores = torch.tensor(predictions.predictions)
probs = torch.sigmoid(raw_scores).numpy()
# A label is predicted when its probability is strictly above 0.5.
y_pred = (probs > 0.5).astype(int)
y_true = predictions.label_ids
# === Classification report
print("📊 Raport klasyfikacji:")
# Options shared by both renderings of the same report (dict and plain text).
report_options = {"target_names": top_rules, "zero_division": 0}
report_dict = classification_report(y_true, y_pred, output_dict=True, **report_options)
report_text = classification_report(y_true, y_pred, **report_options)
print(report_text)
# === Save the reports
# CSV: the report dict is transposed so that its top-level keys become rows.
report_frame = pd.DataFrame(report_dict).T
report_frame.to_csv(REPORT_CSV)
# JSON: the report dict written as-is, indented for readability.
with open(REPORT_JSON, "w") as json_file:
    json.dump(report_dict, json_file, indent=2)
print(f"💾 Zapisano raport CSV: {REPORT_CSV}")
print(f"💾 Zapisano metryki JSON: {REPORT_JSON}")
# === "Confusion matrix" figure (summary)
print("🧱 Generuję multilabel confusion matrix...")
# NOTE(review): `mcm` is computed but not used by the plotting code below; the
# figure saved to CONF_MATRIX_PNG is a bar chart of per-label support, not a
# confusion matrix. Left unchanged so the produced artefact stays the same.
mcm = multilabel_confusion_matrix(y_true, y_pred)
# Column sums of y_true (assumes rows are samples and columns are labels).
support = y_true.sum(axis=0).astype(int)
label_positions = range(len(top_rules))
# Draw through the explicit Figure/Axes handles instead of pyplot's implicit
# "current figure" state.
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(label_positions, support)
ax.set_yticks(label_positions)
ax.set_yticklabels(top_rules)
ax.set_xlabel("Liczba wystąpień w zbiorze testowym")
ax.set_title("🔢 Rozkład występowania reguł w testowym zbiorze")
# Write each bar's count just to the right of the bar, vertically centred.
for bar, count in zip(bars, support):
    ax.text(
        bar.get_width() + 1,
        bar.get_y() + bar.get_height() / 2,
        str(count),
        va="center",
    )
fig.tight_layout()
fig.savefig(CONF_MATRIX_PNG)
# Close this specific figure to release its resources.
plt.close(fig)
print(f"🖼️ Zapisano confusion matrix jako PNG: {CONF_MATRIX_PNG}")
|