LeeSek committed
Commit 097a740 · verified · 1 Parent(s): 09e00ff

Add scripts
scripts/01_create_structure.py ADDED
@@ -0,0 +1,17 @@
import os

def create_dirs():
    dirs = [
        "data/raw",
        "data/processed",
        "data/metadata",
        "models",
        "notebooks",
        "scripts"
    ]
    for d in dirs:
        os.makedirs(d, exist_ok=True)
        print(f"✔️ Created: {d}")

if __name__ == "__main__":
    create_dirs()
scripts/02_setup_env.sh ADDED
@@ -0,0 +1,25 @@
#!/bin/bash

set -e  # exit on any error

echo "📦 Updating system packages and dependencies..."
sudo apt update
sudo apt install -y python3-venv python3-pip

echo "📁 Creating the virtualenv environment..."
python3 -m venv venv
source venv/bin/activate

echo "⬇️ Installing Python dependencies..."
pip install --upgrade pip
pip install -r scripts/requirements.txt

echo "🔧 Checking for the gh CLI..."
if ! command -v gh &> /dev/null; then
    echo "➡️ Installing GitHub CLI..."
    sudo apt install -y gh
fi

echo "🔐 To log in to GitHub: gh auth login"
echo "🔐 To check GitHub auth status: gh auth status"
echo "✅ Environment ready. Activate it with: source venv/bin/activate"
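The script installs Python dependencies from scripts/requirements.txt, which is not part of this commit. A minimal hypothetical sketch, inferred only from the imports used by the scripts below (the package set and the lack of version pins are assumptions):

# scripts/requirements.txt (hypothetical sketch — not included in this commit)
GitPython
datasets
transformers
torch
scikit-learn
numpy
pandas
matplotlib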
scripts/03_fetch_github.py ADDED
@@ -0,0 +1,179 @@
# 03_fetch_github.py
# Fetch repositories containing Dockerfiles – v4
# Usage: python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular

import argparse
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime

# === Topic and language configuration
LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]

DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")

def generate_queries():
    queries = set()
    queries.update(GENERAL)
    queries.update(DEFAULT_QUERIES)
    queries.update(SPECIAL_QUERIES)

    for lang in LANGUAGES:
        for topic in TOPICS:
            queries.add(f"dockerfile {lang} {topic}")

    return sorted(queries)

def run_query(query, limit):
    print(f"🔍 Searching: {query}")
    result = subprocess.run([
        "gh", "search", "repos", query,
        "--limit", str(limit),
        "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url"
    ], capture_output=True, text=True)

    if result.returncode != 0:
        print(f"❌ Query failed: {result.stderr.strip()}")
        return []

    try:
        data = json.loads(result.stdout)
        if not data:
            print(f"⚠️ No results for: {query}")
        return data
    except Exception as e:
        print(f"❌ JSON error: {e}")
        return []

def deduplicate_and_filter(repos, min_stars, min_date):
    seen = set()
    filtered = []
    for r in repos:
        name = r["fullName"]
        updated = datetime.strptime(r["updatedAt"][:10], "%Y-%m-%d")
        if name in seen:
            continue
        if r["stargazersCount"] < min_stars:
            continue
        if updated < min_date:
            continue
        seen.add(name)
        filtered.append(r)
    return filtered

def load_manual_popular_repos(path):
    if not path.exists():
        print(f"⚠️ File not found: {path}")
        return []

    with open(path, "r") as f:
        try:
            data = json.load(f)
            enriched = []
            for r in data:
                enriched.append({
                    "fullName": r["fullName"],
                    "url": r.get("url", ""),
                    "description": r.get("description", ""),
                    "stargazersCount": r.get("stargazersCount", 9999),
                    "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                    "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                    "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z")
                })
            return enriched
        except Exception as e:
            print(f"❌ Error loading popular repositories: {e}")
            return []

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    args.raw_output.parent.mkdir(parents=True, exist_ok=True)
    args.filtered_output.parent.mkdir(parents=True, exist_ok=True)
    min_date = datetime.strptime(args.min_date, "%Y-%m-%d")

    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ {args.raw_output} already exists. Use --refresh to overwrite.")
        return

    all_queries = generate_queries()
    queries = all_queries if args.queries == -1 else all_queries[:args.queries]

    print(f"🧠 Generated {len(queries)} queries:")
    for q in queries:
        print(" •", q)

    all_results = []
    for idx, query in enumerate(queries, 1):
        print(f"\n🔄 [{idx}/{len(queries)}]")
        results = run_query(query, args.limit)
        all_results.extend(results)
        time.sleep(5)

    if args.include_popular:
        print(f"\n📌 Adding popular repositories from: {args.popular_file}")
        all_results.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Total queries: {len(queries)}")
    print(f"📦 Raw results: {len(all_results)}")
    with open(args.raw_output, "w") as f:
        json.dump(all_results, f, indent=2)

    clean_repos = deduplicate_and_filter(all_results, args.min_stars, min_date)
    with open(args.filtered_output, "w") as f:
        json.dump(clean_repos, f, indent=2)

    print(f"✅ After filtering: {len(clean_repos)} repositories")
    print(f"📁 Saved to: {args.filtered_output}")

if __name__ == "__main__":
    main()
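For the --include_popular path, load_manual_popular_repos expects data/metadata/manual_popular_repos.json to be a JSON list of objects with at least a fullName key; missing fields fall back to the defaults shown in the code. A minimal illustrative example (the repository names are placeholders, not real entries):

[
  {
    "fullName": "example-org/example-app",
    "url": "https://github.com/example-org/example-app",
    "stargazersCount": 12000
  },
  { "fullName": "another-org/another-service" }
]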
scripts/04_clone_and_extract.py ADDED
@@ -0,0 +1,118 @@
# 04_clone_and_extract.py
# Dockerfile extraction – v3 (no parser, with correct JSONL output)

import json
import shutil
import hashlib
from pathlib import Path
from git import Repo
from datetime import datetime
import argparse

# === Paths
REPO_LIST_PATH = Path("data/metadata/repos_filtered.json")
CLONE_DIR = Path("temp_repos")
OUTPUT_FILE = Path("data/raw/dockerfiles.jsonl")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

REMOVE_DIRS = [".git", ".github", "docs", "tests", "__pycache__", ".idea", ".vscode"]

def clean_repo(path: Path):
    for d in REMOVE_DIRS:
        shutil.rmtree(path / d, ignore_errors=True)

def compute_sha1(text: str) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()

def is_valid_dockerfile(path: Path) -> bool:
    try:
        text = path.read_text(encoding="utf-8").strip()
        lines = [l.strip().lower() for l in text.splitlines() if l.strip()]
        if len(lines) < 5 or path.stat().st_size > 200_000:
            return False
        top_lines = lines[:10]
        has_from = any(l.startswith("from") for l in top_lines)
        has_run = any(l.startswith(("run", "cmd", "copy")) for l in lines)
        return has_from and has_run
    except Exception as e:
        print(f"⚠️ Validation error for {path}: {e}")
        return False

def find_dockerfiles(repo_path: Path) -> list[Path]:
    return [
        f for f in repo_path.rglob("*")
        if f.name.lower() == "dockerfile" and f.is_file()
    ]

def clone_repo(url: str, full_name: str) -> Path | None:
    dest = CLONE_DIR / full_name.replace("/", "__")
    if dest.exists():
        print(f"⚠️ Repo {full_name} already exists – skipping clone.")
        return dest
    try:
        print(f"⬇️ Cloning {full_name}...")
        Repo.clone_from(url, dest, depth=1)
        clean_repo(dest)
        return dest
    except Exception as e:
        print(f"❌ Clone error for {full_name}: {e}")
        return None

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--purge", action="store_true", help="Delete each repository after extraction")
    args = parser.parse_args()

    with open(REPO_LIST_PATH) as f:
        repos = json.load(f)

    saved, skipped = 0, 0
    seen_hashes = set()

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
        for repo in repos:
            full_name = repo["fullName"]
            url = repo["url"]
            repo_path = clone_repo(url, full_name)
            if not repo_path:
                continue

            for file in find_dockerfiles(repo_path):
                if not is_valid_dockerfile(file):
                    skipped += 1
                    continue

                try:
                    lines = file.read_text(encoding="utf-8").strip().splitlines()
                    lines = [l.rstrip() for l in lines if l.strip()]
                    file_id = compute_sha1("\n".join(lines))
                    if file_id in seen_hashes:
                        skipped += 1
                        continue
                    seen_hashes.add(file_id)

                    json.dump({
                        "repo": full_name,
                        "path": str(file.relative_to(repo_path)),
                        "file_id": file_id,
                        "content": lines,
                        "size_bytes": file.stat().st_size,
                        "line_count": len(lines),
                        "valid": True,
                        "cloned_at": datetime.now().isoformat()
                    }, out_f)
                    out_f.write("\n")
                    saved += 1

                except Exception as e:
                    print(f"⚠️ Write error for {file}: {e}")
                    skipped += 1

            if args.purge:
                shutil.rmtree(repo_path, ignore_errors=True)

    print(f"\n✅ Saved {saved} valid Dockerfiles to {OUTPUT_FILE}")
    print(f"🚫 Skipped {skipped} files (invalid, broken, or duplicated)")

if __name__ == "__main__":
    main()
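Each accepted Dockerfile becomes one JSON line in data/raw/dockerfiles.jsonl with the fields written above; an illustrative record (all values are placeholders):

{"repo": "example-org/example-app", "path": "docker/Dockerfile", "file_id": "3f2b8c...", "content": ["FROM python:3.11-slim", "COPY . /app", "RUN pip install -r requirements.txt"], "size_bytes": 182, "line_count": 3, "valid": true, "cloned_at": "2024-01-01T12:00:00"}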
scripts/05_generate_fixes.py ADDED
@@ -0,0 +1,66 @@
# 05_generate_fixes.py
# Generate a ready-made fixes.json file for the most common Hadolint rules

import json
from pathlib import Path

# === Fix definitions for known Hadolint rules ===
fixes = {
    "DL3008": "Pin versions in apt-get install (e.g. 'apt-get install <package>=<version>') for reproducible builds.",
    "DL4006": "Set 'SHELL [\"/bin/bash\", \"-o\", \"pipefail\", \"-c\"]' before RUN instructions that use a pipe.",
    "DL3003": "Use WORKDIR to switch directories instead of 'RUN cd ...'.",
    "DL4000": "MAINTAINER is deprecated; use a LABEL instead.",
    "DL3015": "Use '--no-install-recommends' with apt-get install to avoid unnecessary packages.",
    "DL3047": "Use 'wget --progress=dot:giga' to avoid excessively bloated build logs.",
    "DL3059": "Consider consolidating multiple consecutive RUN instructions.",
    "DL3009": "Delete the apt cache after installing packages using 'rm -rf /var/lib/apt/lists/*'.",
    "DL3018": "Pin versions in apk add (e.g. 'apk add <package>=<version>').",
    "SC2086": "Use quotes to prevent word splitting and globbing in shell commands.",
    "DL3006": "Always tag the version of the base image (avoid using 'latest').",
    "DL3020": "Use COPY instead of ADD for files and folders.",
    "DL3025": "Use JSON (exec-form) notation for CMD and ENTRYPOINT arguments.",
    "DL3042": "Use 'pip install --no-cache-dir' to avoid keeping the pip cache in the image.",
    "DL3004": "Do not use sudo in Dockerfiles. Run as root or configure user permissions properly.",
    "DL3013": "Specify version numbers in pip install commands to ensure reproducibility.",
    "DL3027": "Do not use 'apt'; use 'apt-get' or 'apt-cache' instead, as 'apt' is meant for end users.",
    "DL3007": "Do not use the 'latest' tag; pin the image tag explicitly.",
    "SC2046": "Quote arguments to prevent word splitting when using command substitution.",
    "DL3033": "Specify versions with yum install (e.g. 'yum install <package>-<version>').",
    "SC2028": "echo may not expand escape sequences; use printf instead.",
    "DL3019": "Use the '--no-cache' switch with apk to avoid having to remove /var/cache/apk/*.",
    "DL4001": "Use either wget or curl, but not both, in the same image.",
    "DL3002": "The last USER should not be root; switch to a non-root user.",
    "SC2016": "Expressions do not expand in single quotes; use double quotes for expansion.",
    "DL3048": "Use valid label keys (lowercase, namespaced) in LABEL instructions.",
    "DL3005": "Do not use apt-get upgrade or dist-upgrade.",
    "DL3045": "Set a WORKDIR before using COPY with a relative destination.",
    "DL3032": "Run 'yum clean all' after yum commands to reduce image size.",
    "DL3016": "Pin versions in npm install (e.g. 'npm install <package>@<version>').",
    "SC2035": "Use './*glob*' or '-- *glob*' so file names starting with dashes are not treated as options.",
    "DL3041": "Specify versions with dnf install (e.g. 'dnf install <package>-<version>').",
    "SC2043": "This loop only runs once over a constant value; loop over a list or drop the loop.",
    "SC2155": "Declare and assign variables in separate steps to avoid masking return codes.",
    "DL3028": "Pin versions in gem install (e.g. 'gem install <gem>:<version>').",
    "DL1000": "Use valid Dockerfile syntax; check for missing instructions or arguments.",
    "SC2164": "Use 'cd ... || exit' or check directory change status to avoid silent errors.",
    "SC2006": "Use modern command substitution: $(...) instead of backticks.",
    "DL3040": "Run 'dnf clean all' after dnf commands to reduce image size.",
    "DL3014": "Use the '-y' switch with apt-get install to avoid interactive prompts.",
    "DL3022": "COPY --from should reference a previously defined FROM alias.",
    "SC3037": "In POSIX sh, echo flags are undefined; use printf instead.",
    "DL3000": "Use an absolute path for WORKDIR.",
    "DL3029": "Do not use the --platform flag with FROM.",
    "SC1088": "Quote strings properly to avoid syntax errors in scripts.",
    "SC3009": "Avoid using variables in redirections or pipelines unless necessary.",
    "SC2251": "Use proper syntax when comparing strings in shell conditions.",
    "SC1001": "Use POSIX-compliant syntax unless bash features are required.",
    "SC3003": "Quote paths and variables consistently to avoid unexpected behavior.",
    "SC1091": "Ensure files sourced with . or source exist and are accessible."
}

# === Write to file ===
fixes_path = Path("data/fixes/fixes.json")
fixes_path.parent.mkdir(parents=True, exist_ok=True)
fixes_path.write_text(json.dumps(fixes, indent=2, ensure_ascii=False))

print(f"✅ Saved {len(fixes)} rules to {fixes_path}")
scripts/06_label_with_fixes.py ADDED
@@ -0,0 +1,175 @@
# 06_label_with_fixes.py – version v4

import json
import tempfile
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime

# === Paths ===
INPUT_PATH = Path("data/raw/dockerfiles.jsonl")
OUTPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
FAILED_LOG = Path("data/labeled/failed_dockerfiles.jsonl")
MISSING_FIXES_LOG = Path("data/labeled/missing_fixes.txt")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

FIXES_PATH = Path("data/fixes/fixes.json")
HADOLINT_BIN = "hadolint"
MAX_WORKERS = 6
TIMEOUT_SECONDS = 5

# === Global fix dictionary ===
with open(FIXES_PATH, encoding="utf-8") as f:
    FIXES = json.load(f)

MISSING_FIXES = set()

def attach_fixes(rules_triggered: list[str]) -> dict:
    suggestions = {}
    for rule in rules_triggered:
        if rule in FIXES:
            suggestions[rule] = FIXES[rule]
        else:
            MISSING_FIXES.add(rule)
    return suggestions

def lint_dockerfile(entry: dict) -> dict:
    try:
        content = entry["content"]
        joined = "\n".join(content)

        with tempfile.NamedTemporaryFile("w", suffix=".Dockerfile", delete=False) as tmp:
            tmp.write(joined)
            tmp.flush()
            temp_path = tmp.name

        result = subprocess.run(
            [HADOLINT_BIN, temp_path, "-f", "json"],
            capture_output=True,
            text=True,
            timeout=TIMEOUT_SECONDS
        )

        Path(temp_path).unlink(missing_ok=True)

        if result.returncode == 0:
            return {
                "label": "good",
                "rules_triggered": [],
                "lines": {},
                "fix_suggestions": {},
                "repo": entry["repo"],
                "path": entry["path"],
                "content": content,
                "timestamp": datetime.now().isoformat()
            }

        try:
            findings = json.loads(result.stdout)
            rules = sorted(set(item["code"] for item in findings if "code" in item))
            line_map = {}
            for item in findings:
                code = item.get("code")
                line = item.get("line")
                if code and line:
                    line_map.setdefault(code, line)

            fix_suggestions = attach_fixes(rules)

        except Exception:
            rules = ["lint-parse-error"]
            line_map = {}
            fix_suggestions = {}

        return {
            "label": "bad",
            "rules_triggered": rules,
            "lines": line_map,
            "fix_suggestions": fix_suggestions,
            "repo": entry["repo"],
            "path": entry["path"],
            "content": content,
            "timestamp": datetime.now().isoformat()
        }

    except subprocess.TimeoutExpired:
        return {
            "label": "bad",
            "rules_triggered": ["lint-timeout"],
            "lines": {},
            "fix_suggestions": {},
            "repo": entry.get("repo"),
            "path": entry.get("path"),
            "content": entry.get("content"),
            "timestamp": datetime.now().isoformat()
        }

    except Exception as e:
        return {
            "label": "bad",
            "rules_triggered": [f"lint-error:{str(e)}"],
            "lines": {},
            "fix_suggestions": {},
            "repo": entry.get("repo"),
            "path": entry.get("path"),
            "content": entry.get("content"),
            "timestamp": datetime.now().isoformat()
        }

def main():
    with open(INPUT_PATH, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    print(f"🚀 Starting analysis of {len(records)} Dockerfiles (workers={MAX_WORKERS})")

    results, failed = [], []

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(lint_dockerfile, row) for row in records]

        for i, future in enumerate(as_completed(futures)):
            try:
                result = future.result()
                if "rules_triggered" not in result:
                    failed.append(result)
                else:
                    results.append(result)
            except Exception as e:
                failed.append({
                    "label": "bad",
                    "rules_triggered": [f"future-error:{str(e)}"],
                    "lines": {},
                    "fix_suggestions": {},
                    "repo": "unknown",
                    "path": "unknown",
                    "content": [],
                    "timestamp": datetime.now().isoformat()
                })

            if (i + 1) % 250 == 0:
                print(f" 🔄 {i+1}/{len(records)} processed...")

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
        for rec in results:
            json.dump(rec, f_out)
            f_out.write("\n")

    with open(FAILED_LOG, "w", encoding="utf-8") as f_fail:
        for rec in failed:
            json.dump(rec, f_fail)
            f_fail.write("\n")

    # attach_fixes runs inside worker processes, so updates to MISSING_FIXES there
    # are not visible in the parent; recover the missing rules from the collected results.
    for rec in results:
        for rule in rec.get("rules_triggered", []):
            if rule not in FIXES and not rule.startswith(("lint-", "future-")):
                MISSING_FIXES.add(rule)

    if MISSING_FIXES:
        print(f"\n⚠️ Missing fixes for {len(MISSING_FIXES)} rules – writing them to {MISSING_FIXES_LOG}")
        with open(MISSING_FIXES_LOG, "w", encoding="utf-8") as f_miss:
            for rule in sorted(MISSING_FIXES):
                f_miss.write(rule + "\n")
    else:
        print("✅ Every triggered rule has an assigned fix!")

    print(f"\n✅ Saved {len(results)} labeled Dockerfiles with fixes → {OUTPUT_PATH}")
    print(f"❌ Failed: {len(failed)} → {FAILED_LOG}")

if __name__ == "__main__":
    main()
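A labeled record in data/labeled/labeled_dockerfiles.jsonl then carries the Hadolint findings plus the matching fix texts; a sketch of one "bad" line (all values are illustrative placeholders):

{"label": "bad", "rules_triggered": ["DL3007", "DL3059"], "lines": {"DL3007": 1, "DL3059": 4}, "fix_suggestions": {"DL3007": "...", "DL3059": "..."}, "repo": "example-org/example-app", "path": "Dockerfile", "content": ["FROM python:latest", "..."], "timestamp": "2024-01-01T12:00:00"}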
scripts/07_explore_labeled_dataset.py ADDED
@@ -0,0 +1,126 @@
# 07_explore_labeled_dataset.py

import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

# === Paths and configuration
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
TOP_N = 30

# === Initialization
labels_counter = Counter()
rules_counter = Counter()
rules_per_file = []
lines_with_errors_per_file = []
lengths = []
all_line_positions = []

fixable_counter = 0
unique_rules_with_fixes = set()

print("🔍 Analyzing data...")

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        labels_counter[obj["label"]] += 1

        if obj["label"] == "bad":
            rules = obj.get("rules_triggered", [])
            rules_counter.update(rules)
            rules_per_file.append(len(rules))

            # Fix analysis
            fixes = obj.get("fix_suggestions", {})
            if fixes:
                fixable_counter += 1
                unique_rules_with_fixes.update(fixes.keys())

            # Error lines – v4
            lines = obj.get("lines", {}).values()
            line_set = set(lines)
            lines_with_errors_per_file.append(len(line_set))
            all_line_positions.extend(lines)

        # File length
        lengths.append(len(obj["content"]))

# === Overall statistics
print("\n📊 Statistics:")
print(f"✅ Good: {labels_counter['good']}")
print(f"❌ Bad: {labels_counter['bad']}")
print(f"🧩 Total number of unique rules: {len(rules_counter)}")
print(f"🛠 Files with at least one available fix: {fixable_counter}")
print(f"🔧 Unique rules with an assigned fix: {len(unique_rules_with_fixes)}")

# === Top N rules
top_rules = rules_counter.most_common(TOP_N)
print(f"\n🏆 Top {TOP_N} most frequently violated rules:")
for code, count in top_rules:
    print(f" {code}: {count}x")

# === Save the top N to a JSON file
TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(TOP_RULES_PATH, "w", encoding="utf-8") as f:
    json.dump([code for code, _ in top_rules], f, indent=2)
print(f"\n💾 Saved top {TOP_N} rules to {TOP_RULES_PATH}")

# === Dockerfile lengths
lengths_np = np.array(lengths)
print("\n📏 Dockerfile length (lines):")
print(f" Mean: {lengths_np.mean():.2f}")
print(f" Median: {np.median(lengths_np):.0f}")
print(f" Min: {lengths_np.min()}")
print(f" Max: {lengths_np.max()}")

# === Histograms
Path("data/metadata").mkdir(parents=True, exist_ok=True)

# 1. File lengths
plt.figure()
plt.hist(lengths_np, bins=40, color="skyblue", edgecolor="black")
plt.title("Distribution of Dockerfile lengths")
plt.xlabel("Number of lines")
plt.ylabel("Number of files")
plt.grid(True)
plt.tight_layout()
plt.savefig("data/metadata/dockerfile_length_hist.png")

# 2. Rules per file
if rules_per_file:
    plt.figure()
    plt.hist(rules_per_file, bins=range(1, max(rules_per_file)+2), color="salmon", edgecolor="black")
    plt.title("Number of violated rules per file")
    plt.xlabel("Number of rules")
    plt.ylabel("Number of files")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/rules_per_file_hist.png")

# 3. Error lines per file
if lines_with_errors_per_file:
    plt.figure()
    plt.hist(lines_with_errors_per_file, bins=range(1, max(lines_with_errors_per_file)+2), color="orchid", edgecolor="black")
    plt.title("Number of lines with errors per file")
    plt.xlabel("Number of lines with errors")
    plt.ylabel("Number of files")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/error_lines_per_file_hist.png")

# 4. Distribution of error positions
if all_line_positions:
    plt.figure()
    plt.hist(all_line_positions, bins=50, color="gold", edgecolor="black")
    plt.title("Distribution of error positions (line numbers)")
    plt.xlabel("Line number")
    plt.ylabel("Number of errors")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/line_positions_hist.png")

print("\n📊 Saved plots to data/metadata/")
scripts/08_balance_dataset.py ADDED
@@ -0,0 +1,86 @@
# 08_balance_dataset.py

import json
import random
from pathlib import Path
from collections import Counter
import shutil

# === Paths
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
BACKUP_PATH = Path("data/labeled/labeled_dockerfiles_backup.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_PATH = INPUT_PATH
MAX_GOOD = 1500
MAX_BAD = 15000
TOP_N = 30

# === Backup
if INPUT_PATH.exists():
    if not BACKUP_PATH.exists():
        print(f"📦 Creating a backup → {BACKUP_PATH.name}")
        shutil.copy(INPUT_PATH, BACKUP_PATH)
    else:
        print(f"ℹ️ Backup already exists: {BACKUP_PATH.name}")

# === Load the top 30 rules
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    top_rules = set(json.load(f)[:TOP_N])
print(f"🏆 Using the top {TOP_N} rules")

# === Load data
print("🔍 Loading data...")
good_samples = []
bad_samples = []

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        if obj["label"] == "good":
            good_samples.append(obj)
        elif obj["label"] == "bad":
            rules = set(obj.get("rules_triggered", []))
            if rules & top_rules:
                bad_samples.append(obj)

print(f"✅ Good: {len(good_samples)} | ❌ Bad containing top {TOP_N} rules: {len(bad_samples)}")

# === Random selection of GOOD samples
balanced_good = random.sample(good_samples, min(MAX_GOOD, len(good_samples)))

# === Select BAD samples by rarity of the top 30 rules
print("⚙️ Scoring BAD files by rule rarity...")

rule_freq = Counter()
for sample in bad_samples:
    rules = sample.get("rules_triggered", [])
    rule_freq.update(r for r in rules if r in top_rules)

def compute_score(sample):
    rules = set(sample.get("rules_triggered", [])) & top_rules
    return sum(1 / rule_freq[r] for r in rules if rule_freq[r] > 0)

scored_bad = sorted(
    bad_samples,
    key=lambda s: (
        compute_score(s),
        -len(set(s.get("rules_triggered", [])) & top_rules)
    ),
    reverse=True
)

balanced_bad = scored_bad[:MAX_BAD]

# === Merge and save
balanced_all = balanced_good + balanced_bad
random.shuffle(balanced_all)

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
    for rec in balanced_all:
        json.dump(rec, f_out)
        f_out.write("\n")

print(f"\n✅ Saved balanced dataset (top {TOP_N} rules only): {len(balanced_all)} → {OUTPUT_PATH.name}")
print(f" - Good: {len(balanced_good)}")
print(f" - Bad: {len(balanced_bad)}")
scripts/09.2_prepare_multilabel_dataset.py ADDED
@@ -0,0 +1,100 @@
# scripts/09.2_prepare_multilabel_dataset.py

import json
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# === Configuration
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_DIR = Path("data/processed/dataset_multilabel_top30")
TOKENIZER_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
SEED = 42

def load_top_rules():
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        return json.load(f)

def build_dataset(records, top_rules):
    rule2id = {r: i for i, r in enumerate(top_rules)}
    data = []
    for row in records:
        if row.get("label") != "bad":
            continue

        triggered = row.get("rules_triggered", [])
        multilabel = [0] * len(top_rules)
        matched = False

        for rule in triggered:
            if rule in rule2id:
                multilabel[rule2id[rule]] = 1
                matched = True

        if not matched:
            continue

        data.append({
            "text": "\n".join(row["content"]) if isinstance(row["content"], list) else str(row["content"]),
            "labels": multilabel,
            "meta_lines": row.get("lines", {}),
            "meta_fixes": row.get("fix_suggestions", {})  # field name written by 06_label_with_fixes.py
        })

    return data

def main():
    print("📥 Loading data...")
    top_rules = load_top_rules()
    print(f"🔝 Top {len(top_rules)} rules: {top_rules}")

    with INPUT_PATH.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    dataset = build_dataset(records, top_rules)
    print(f"📦 Built {len(dataset)} multilabel examples.")

    if not dataset:
        print("❌ No data to process. Check the input data.")
        return

    print("🔀 Splitting into train/val/test...")
    train_val, test = train_test_split(dataset, test_size=0.1, random_state=SEED)
    train, val = train_test_split(train_val, test_size=0.1111, random_state=SEED)

    ds = DatasetDict({
        "train": Dataset.from_list(train),
        "validation": Dataset.from_list(val),
        "test": Dataset.from_list(test),
    })

    print("🔤 Tokenizing...")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

    def tokenize_function(batch):
        texts = [str(x) if x is not None else "" for x in batch["text"]]
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH
        )

    ds_tokenized = ds.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "meta_lines", "meta_fixes"]
    )

    print(f"💾 Saving to: {OUTPUT_DIR}")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    ds_tokenized.save_to_disk(str(OUTPUT_DIR))

    print("✅ Done.")

if __name__ == "__main__":
    main()
scripts/10.2_train_multilabel_model.py ADDED
@@ -0,0 +1,101 @@
# scripts/10.2_train_multilabel_model.py

import os
import json
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from datasets import load_from_disk
from torch.utils.data import default_collate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# === Configuration
DATA_PATH = "data/processed/dataset_multilabel_top30"
OUTPUT_DIR = "models/multilabel"
MODEL_NAME = "microsoft/codebert-base"
NUM_LABELS = 30
NUM_EPOCHS = 12
SEED = 42

# === Load data and tokenizer
print("📂 Loading data and tokenizer...")
ds = load_from_disk(DATA_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# === Model
print("🧠 Initializing the model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# === Metrics
def compute_metrics(pred):
    logits, labels = pred
    probs = 1 / (1 + np.exp(-logits))  # sigmoid
    preds = (probs > 0.5).astype(int)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="micro"),
        "precision": precision_score(labels, preds, average="micro"),
        "recall": recall_score(labels, preds, average="micro"),
    }

# === Batch collator: force float32 labels
def collate_fn(batch):
    batch = default_collate(batch)
    batch["labels"] = batch["labels"].float()
    return batch

# === Training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_total_limit=2,
    seed=SEED,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=50,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# === Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"].with_format("torch"),
    eval_dataset=ds["validation"].with_format("torch"),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    data_collator=collate_fn,
)

# === Training
print("🚀 Starting training...")
trainer.train()

# === Save model and logs
print("💾 Saving the model and logs...")
trainer.save_model(OUTPUT_DIR)

log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

print(f"📝 Saved the training log to {log_path}")
print("✅ Done.")
scripts/11.2_evaluate_multilabel.py ADDED
@@ -0,0 +1,96 @@
# 11.2_evaluate_multilabel.py

import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from torch.utils.data import default_collate
from sklearn.metrics import classification_report, multilabel_confusion_matrix

# === Paths
MODEL_DIR = Path("models/multilabel/")
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")

OUT_DIR = MODEL_DIR
REPORT_CSV = OUT_DIR / "classification_report.csv"
REPORT_JSON = OUT_DIR / "metrics.json"
CONF_MATRIX_PNG = OUT_DIR / "confusion_matrix_multilabel.png"

# === Data collator for float32 labels
def collate_fn(batch):
    batch = default_collate(batch)
    batch["labels"] = batch["labels"].float()
    return batch

# === Load top_rules
with open(TOP_RULES_PATH) as f:
    top_rules = json.load(f)

# === Load model + tokenizer
print("📂 Loading the model...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
except Exception:
    print("⚠️ No tokenizer found with the model — downloading microsoft/codebert-base")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    tokenizer.save_pretrained(MODEL_DIR)

# === Load the data and create a Trainer
dataset = load_from_disk(str(DATASET_DIR))
trainer = Trainer(model=model, data_collator=collate_fn)

# === Prediction
print("🔍 Predicting on the test set...")
predictions = trainer.predict(dataset["test"].with_format("torch"))
probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
y_pred = (probs > 0.5).astype(int)
y_true = predictions.label_ids

# === Classification report
print("📊 Classification report:")
report_dict = classification_report(
    y_true,
    y_pred,
    target_names=top_rules,
    zero_division=0,
    output_dict=True
)
report_text = classification_report(y_true, y_pred, target_names=top_rules, zero_division=0)
print(report_text)

# === Save reports
pd.DataFrame(report_dict).transpose().to_csv(REPORT_CSV)
with open(REPORT_JSON, "w") as f:
    json.dump(report_dict, f, indent=2)

print(f"💾 Saved CSV report: {REPORT_CSV}")
print(f"💾 Saved JSON metrics: {REPORT_JSON}")

# === Confusion matrix (aggregate) and per-rule support
print("🧱 Generating the multilabel confusion matrix...")
mcm = multilabel_confusion_matrix(y_true, y_pred)  # computed per rule; only per-rule support is plotted below
support = y_true.sum(axis=0).astype(int)

fig, ax = plt.subplots(figsize=(12, 8))
bars = plt.barh(range(len(top_rules)), support)
plt.yticks(range(len(top_rules)), top_rules)
plt.xlabel("Occurrences in the test set")
plt.title("🔢 Distribution of rule occurrences in the test set")

for i, bar in enumerate(bars):
    width = bar.get_width()
    plt.text(width + 1, bar.get_y() + bar.get_height() / 2, str(support[i]), va='center')

plt.tight_layout()
plt.savefig(CONF_MATRIX_PNG)
plt.close()
print(f"🖼️ Saved confusion matrix as PNG: {CONF_MATRIX_PNG}")
scripts/12.2_predict_multilabel_file.py ADDED
@@ -0,0 +1,88 @@
# 12.2_predict_multilabel_file.py
# Usage: python scripts/12.2_predict_multilabel_file.py test/Dockerfile --debug

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import sys
from pathlib import Path
import numpy as np
import json

# === Paths
MODEL_DIR = Path("models/multilabel/")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
MAX_LENGTH = 512
THRESHOLD = 0.5  # detection threshold

# === Load the rule labels
def load_labels():
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        return json.load(f)

# === Load the model and tokenizer
def load_model_and_tokenizer():
    if MODEL_DIR.exists():
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
    else:
        raise FileNotFoundError(f"❌ Model directory not found: {MODEL_DIR}")
    model.eval()
    return tokenizer, model

# === Prediction
def predict(filepath: Path, tokenizer, model, labels, threshold=THRESHOLD, debug=False):
    text = filepath.read_text(encoding="utf-8")

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.sigmoid(logits).squeeze().cpu().numpy()

    triggered = [(labels[i], probs[i]) for i in range(len(labels)) if probs[i] > threshold]
    top5 = np.argsort(probs)[-5:][::-1]

    print(f"\n🧪 Prediction for file: {filepath.name}")
    print(f"📄 File length: {len(text.splitlines())} lines")

    if triggered:
        print(f"\n🚨 Detected rules (p > {threshold}):")
        for rule, p in triggered:
            print(f" - {rule}: {p:.3f}")
    else:
        print("✅ No issues detected (no rule exceeded the threshold)")

    if debug:
        print("\n🛠 DEBUG INFO:")
        print(f"📝 Text snippet:\n{text[:300]}")
        print(f"🔢 Tokens: {len(inputs['input_ids'][0])}")
        print(f"📈 Logits: {logits.squeeze().tolist()}")
        print("\n🔥 Top 5 predictions:")
        for idx in top5:
            print(f" - {labels[idx]}: {probs[idx]:.3f}")

# === Main
def main():
    if len(sys.argv) < 2:
        print("❌ Usage: python scripts/12.2_predict_multilabel_file.py /path/to/Dockerfile [--debug]")
        sys.exit(1)

    filepath = Path(sys.argv[1])
    debug = "--debug" in sys.argv

    if not filepath.exists():
        print(f"❌ File {filepath} does not exist.")
        sys.exit(1)

    labels = load_labels()
    tokenizer, model = load_model_and_tokenizer()
    predict(filepath, tokenizer, model, labels, debug=debug)

if __name__ == "__main__":
    main()
scripts/13.2_threshold_calibration.py ADDED
@@ -0,0 +1,71 @@
# 13.2_threshold_calibration.py – per-rule threshold calibration for the multilabel model

import json
import numpy as np
import torch
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from sklearn.metrics import f1_score
from pathlib import Path

# === Paths
MODEL_DIR = Path("models/multilabel")
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_PATH = MODEL_DIR / "thresholds.json"

# === Load the rule list
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    labels = json.load(f)
label_count = len(labels)

# === Model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(str(MODEL_DIR.resolve()))
tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR.resolve()))

# === Trainer with BCE loss
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

trainer = MultilabelTrainer(model=model)

# === Validation split
ds = load_from_disk(str(DATASET_DIR.resolve()))
val_dataset = ds["validation"]

# === Prediction
print("🔍 Generating predictions on the validation set...")
predictions = trainer.predict(val_dataset)
logits = torch.tensor(predictions.predictions)
probs = torch.sigmoid(logits).numpy()
y_true = predictions.label_ids

# === Calibration
print("⚙️ Calibrating a threshold for each rule...")
thresholds = {}
search_space = np.arange(0.05, 0.96, 0.05)

for i, label in enumerate(labels):
    best_f1 = 0.0
    best_thresh = 0.5
    for t in search_space:
        y_pred = (probs[:, i] > t).astype(int)
        score = f1_score(y_true[:, i], y_pred, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_thresh = round(t, 3)
    thresholds[label] = best_thresh
    print(f"📈 {label}: threshold={best_thresh} (f1={best_f1:.4f})")

# === Save thresholds
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(thresholds, f, indent=2)

print(f"\n✅ Saved thresholds to {OUTPUT_PATH}")
scripts/14.2_generate_rules_description.py ADDED
@@ -0,0 +1,63 @@
# 14.2_generate_rules_description.py

import json
from pathlib import Path

# === Top 30 rules used in v4
top_rules = [
    "DL4006", "DL3008", "SC2086", "DL3003", "DL3015", "DL3047", "DL3009", "DL3004", "DL4001", "DL4000",
    "DL3059", "DL3018", "SC2016", "SC2046", "DL3006", "SC2028", "DL3027", "DL3020", "DL3025", "DL3042",
    "DL3013", "DL3007", "DL3033", "SC2043", "DL3019", "DL3005", "DL3002", "DL3048", "DL3045", "DL3032"
]

# === Rule descriptions — based on the Hadolint documentation
descriptions = {
    "DL4006": "Set the SHELL option -o pipefail before using RUN with a pipe.",
    "DL3008": "Pin versions in apt-get install. Avoid floating dependencies.",
    "SC2086": "Double quote to prevent globbing and word splitting.",
    "DL3003": "Use WORKDIR to switch to a directory instead of RUN cd.",
    "DL3015": "Avoid additional packages by specifying --no-install-recommends.",
    "DL3047": "Use wget with --progress to avoid excessively bloated build logs.",
    "DL3009": "Delete the apt-get lists after installing packages.",
    "DL3004": "Do not use sudo as it leads to unpredictable behavior in containers.",
    "DL4001": "Either use wget or curl, but not both.",
    "DL4000": "MAINTAINER is deprecated. Use LABEL instead.",
    "DL3059": "Multiple consecutive RUN instructions should be combined.",
    "DL3018": "Pin versions in apk add commands.",
    "SC2016": "Expressions don't expand in single quotes. Use double quotes.",
    "SC2046": "Quote this to prevent word splitting.",
    "DL3006": "Always tag the version of the base image explicitly.",
    "SC2028": "echo may not expand escape sequences. Use printf instead.",
    "DL3027": "Do not use apt; use apt-get or apt-cache instead.",
    "DL3020": "Use COPY instead of ADD for files and folders.",
    "DL3025": "Use JSON notation for CMD and ENTRYPOINT arguments.",
    "DL3042": "Avoid use of the cache directory with pip. Use pip install --no-cache-dir.",
    "DL3013": "Pin versions in pip install to ensure reproducibility.",
    "DL3007": "Using latest is prone to errors. Pin the image tag explicitly.",
    "DL3033": "Specify version with yum install to ensure reproducibility.",
    "SC2043": "This loop will only ever run once for a constant value.",
    "DL3019": "Use the --no-cache switch with apk to avoid caching package indexes.",
    "DL3005": "Do not use apt-get upgrade or dist-upgrade.",
    "DL3002": "Last USER should not be root.",
    "DL3048": "Invalid label key; use valid, namespaced label keys.",
    "DL3045": "COPY to a relative destination without WORKDIR set.",
    "DL3032": "'yum clean all' is missing after a yum command."
}

# === Build the JSON structure
output_data = {}
for rule in top_rules:
    output_data[rule] = {
        "code": rule,
        "title": descriptions.get(rule, "No title available."),
        "description": descriptions.get(rule, "No description available."),
        "documentation": ""
    }

# === Save
output_path = Path("data/metadata/rules_descriptions_en.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2)

print(f"✅ Saved {len(output_data)} rules to {output_path}")
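Taken together, the scripts form a pipeline from repository discovery to a calibrated multilabel classifier. A typical end-to-end run might look like the sketch below; it assumes gh is authenticated, hadolint is on PATH, and the virtualenv from 02_setup_env.sh is active (test/Dockerfile is the hypothetical path from the prediction script's own usage comment):

python scripts/01_create_structure.py
python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular
python scripts/04_clone_and_extract.py --purge
python scripts/05_generate_fixes.py
python scripts/06_label_with_fixes.py
python scripts/07_explore_labeled_dataset.py
python scripts/08_balance_dataset.py
python scripts/09.2_prepare_multilabel_dataset.py
python scripts/10.2_train_multilabel_model.py
python scripts/11.2_evaluate_multilabel.py
python scripts/13.2_threshold_calibration.py
python scripts/12.2_predict_multilabel_file.py test/Dockerfile --debug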