|
|
|
|
|
import os |
|
import json |
|
import torch |
|
import numpy as np |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
|
|
from pathlib import Path |
|
from datasets import load_from_disk |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer |
|
from torch.utils.data import default_collate |
|
from sklearn.metrics import classification_report, multilabel_confusion_matrix |
|
|
|
|
|
# Input locations: the saved model directory, the on-disk dataset and the
# JSON file holding the label (rule) names.
MODEL_DIR = Path("models/multilabel/")
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")

# Evaluation artifacts are written into the model directory itself.
OUT_DIR = MODEL_DIR
REPORT_CSV = OUT_DIR.joinpath("classification_report.csv")
REPORT_JSON = OUT_DIR.joinpath("metrics.json")
CONF_MATRIX_PNG = OUT_DIR.joinpath("confusion_matrix_multilabel.png")
|
|
|
|
|
def collate_fn(batch):
    """Collate samples with ``default_collate`` and cast ``labels`` to float.

    Args:
        batch: Samples in a form accepted by
            ``torch.utils.data.default_collate``; the collated result must
            contain a ``"labels"`` tensor.

    Returns:
        The collated batch, with its ``"labels"`` entry replaced by the
        result of calling ``.float()`` on it.
    """
    collated = default_collate(batch)
    # NOTE(review): presumably the multi-label loss expects float targets —
    # confirm against the training script.
    float_labels = collated["labels"].float()
    collated["labels"] = float_labels
    return collated
|
|
|
|
|
# Load the label names; they are passed as `target_names` to the
# classification report and used as tick labels in the plot.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# being left to the platform's locale default (which differs on Windows).
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    top_rules = json.load(f)
|
|
|
|
|
print("📂 Wczytywanie modelu...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

# The model directory may not contain tokenizer files. In that case fall back
# to the `microsoft/codebert-base` tokenizer and save a copy next to the
# model so that it can be loaded from MODEL_DIR afterwards.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
except (OSError, ValueError) as exc:
    # Only loading failures trigger the fallback. A bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    print("⚠️ Brak tokenizera w modelu — pobieram z microsoft/codebert-base")
    # Surface the underlying reason so a different failure is not mistaken
    # for a missing tokenizer.
    print(f"   ({type(exc).__name__}: {exc})")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    tokenizer.save_pretrained(MODEL_DIR)
|
|
|
|
|
# Load the saved dataset and build the Trainer used for prediction below;
# `collate_fn` is installed as its data collator.
dataset = load_from_disk(str(DATASET_DIR))
trainer = Trainer(model=model, data_collator=collate_fn)

print("🔍 Predykcja na zbiorze testowym...")
test_split = dataset["test"].with_format("torch")
predictions = trainer.predict(test_split)

# The sigmoid maps the raw model outputs to (0, 1) scores; every score above
# 0.5 counts as a positive prediction for that label.
raw_outputs = torch.tensor(predictions.predictions)
probs = torch.sigmoid(raw_outputs).numpy()
y_pred = np.greater(probs, 0.5).astype(int)
y_true = predictions.label_ids
|
|
|
|
|
print("📊 Raport klasyfikacji:")
# Options shared by the dict-shaped and the text-shaped report.
report_options = {"target_names": top_rules, "zero_division": 0}
report_dict = classification_report(y_true, y_pred, output_dict=True, **report_options)
report_text = classification_report(y_true, y_pred, **report_options)
print(report_text)

# Persist the metrics: the dict is transposed so that each report entry
# becomes a CSV row, and dumped unchanged as indented JSON.
report_frame = pd.DataFrame(report_dict).T
report_frame.to_csv(REPORT_CSV)
REPORT_JSON.write_text(json.dumps(report_dict, indent=2))

print(f"💾 Zapisano raport CSV: {REPORT_CSV}")
print(f"💾 Zapisano metryki JSON: {REPORT_JSON}")
|
|
|
|
|
print("🧱 Generuję multilabel confusion matrix...")
# NOTE(review): `mcm` is computed but not consumed anywhere in this section;
# the figure saved below is a bar chart of per-label counts, not the
# confusion matrix itself — confirm whether that is intended.
mcm = multilabel_confusion_matrix(y_true, y_pred)
# Column sums of `y_true` (assumes rows are samples and columns are labels).
support = y_true.sum(axis=0).astype(int)

fig, ax = plt.subplots(figsize=(12, 8))
positions = range(len(top_rules))
bars = ax.barh(positions, support)
ax.set_yticks(positions)
ax.set_yticklabels(top_rules)
ax.set_xlabel("Liczba wystąpień w zbiorze testowym")
ax.set_title("🔢 Rozkład występowania reguł w testowym zbiorze")

# Write each count just to the right of its bar, vertically centred.
for bar, count in zip(bars, support):
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(bar.get_width() + 1, label_y, str(count), va='center')

fig.tight_layout()
fig.savefig(CONF_MATRIX_PNG)
plt.close(fig)
print(f"🖼️ Zapisano confusion matrix jako PNG: {CONF_MATRIX_PNG}")
|
|