# scripts/09.2_prepare_multilabel_dataset.py
import json
from pathlib import Path

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# === Configuration ===
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_DIR = Path("data/processed/dataset_multilabel_top30")
TOKENIZER_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
SEED = 42


def load_top_rules():
    """Load the list of top rule IDs that defines the label space."""
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        return json.load(f)


def build_dataset(records, top_rules):
    """Convert labeled Dockerfile records into multi-hot examples over top_rules."""
    rule2id = {r: i for i, r in enumerate(top_rules)}
    data = []
    for row in records:
        # Only "bad" Dockerfiles carry triggered rules worth predicting.
        if row.get("label") != "bad":
            continue
        triggered = row.get("rules_triggered", [])
        multilabel = [0] * len(top_rules)
        matched = False
        for rule in triggered:
            if rule in rule2id:
                multilabel[rule2id[rule]] = 1
                matched = True
        # Skip examples that trigger none of the top rules.
        if not matched:
            continue
        data.append({
            "text": "\n".join(row["content"]) if isinstance(row["content"], list) else str(row["content"]),
            "labels": multilabel,
            "meta_lines": row.get("lines", {}),
            "meta_fixes": row.get("fixes", {}),
        })
    return data


def main():
    print("📥 Loading data...")
    top_rules = load_top_rules()
    print(f"🔝 Top {len(top_rules)} rules: {top_rules}")

    with INPUT_PATH.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    dataset = build_dataset(records, top_rules)
    print(f"📦 Built {len(dataset)} multilabel examples.")

    if not dataset:
        print("❌ No data to process. Check the input files.")
        return

    print("🔀 Splitting into train/val/test...")
    # 10% test, then ~11.11% of the remaining 90% for validation -> roughly 80/10/10.
    train_val, test = train_test_split(dataset, test_size=0.1, random_state=SEED)
    train, val = train_test_split(train_val, test_size=0.1111, random_state=SEED)

    ds = DatasetDict({
        "train": Dataset.from_list(train),
        "validation": Dataset.from_list(val),
        "test": Dataset.from_list(test),
    })

    print("🔤 Tokenizing...")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

    def tokenize_function(batch):
        texts = [str(x) if x is not None else "" for x in batch["text"]]
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )

    # Tokenize in batches and drop the raw text/metadata columns from the saved dataset.
    ds_tokenized = ds.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "meta_lines", "meta_fixes"],
    )

    print(f"💾 Saving to: {OUTPUT_DIR}")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    ds_tokenized.save_to_disk(str(OUTPUT_DIR))
    print("✅ Done.")


if __name__ == "__main__":
    main()
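
# --- Usage sketch (illustrative only) --------------------------------------
# A minimal check of the artifact produced above, assuming the script has
# already been run and OUTPUT_DIR exists on disk. `load_from_disk` is the
# standard `datasets` loader for `save_to_disk` output; the column names
# below are the ones this script writes.
#
#   from datasets import load_from_disk
#
#   ds = load_from_disk("data/processed/dataset_multilabel_top30")
#   print(ds)                                # DatasetDict with train/validation/test splits
#   print(ds["train"][0]["labels"])          # multi-hot vector over the top rules
#   print(len(ds["train"][0]["input_ids"]))  # == MAX_LENGTH (padded/truncated)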