LeeSek committed
Commit 097a740 · verified · 1 Parent(s): 09e00ff

Add scripts
scripts/01_create_structure.py ADDED
@@ -0,0 +1,17 @@
import os

def create_dirs():
    dirs = [
        "data/raw",
        "data/processed",
        "data/metadata",
        "models",
        "notebooks",
        "scripts"
    ]
    for d in dirs:
        os.makedirs(d, exist_ok=True)
        print(f"✔️ Created: {d}")

if __name__ == "__main__":
    create_dirs()
scripts/02_setup_env.sh ADDED
@@ -0,0 +1,25 @@
#!/bin/bash

set -e  # exit on any error

echo "📦 Updating system packages and dependencies..."
sudo apt update
sudo apt install -y python3-venv python3-pip

echo "📁 Creating the virtualenv environment..."
python3 -m venv venv
source venv/bin/activate

echo "⬇️ Installing Python dependencies..."
pip install --upgrade pip
pip install -r scripts/requirements.txt

echo "🔧 Checking for the gh CLI..."
if ! command -v gh &> /dev/null; then
    echo "➡️ Installing GitHub CLI..."
    sudo apt install -y gh
fi

echo "🔐 To log in to GitHub: gh auth login"
echo "🔐 To check GitHub auth status: gh auth status"
echo "✅ Environment ready. Activate it with: source venv/bin/activate"
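The script installs Python dependencies from scripts/requirements.txt, which is not part of this commit. A minimal hypothetical sketch, inferred only from the imports used by the scripts below (the package set and the lack of version pins are assumptions):

# scripts/requirements.txt (hypothetical sketch — not included in this commit)
GitPython
datasets
transformers
torch
scikit-learn
numpy
pandas
matplotlib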
scripts/03_fetch_github.py ADDED
@@ -0,0 +1,179 @@
# 03_fetch_github.py
# Fetch repositories containing Dockerfiles – v4
# Usage: python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular

import argparse
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime

# === Topic and language configuration
LANGUAGES = [
    "python", "node", "go", "java", "rust", "php",
    "ruby", "typescript", "csharp", "scala", "kotlin", "perl", "elixir", "swift"
]
TOPICS = [
    "backend", "frontend", "production", "testing", "ci",
    "ml", "devops", "containers", "docker", "cloud", "microservices"
]
GENERAL = [
    "dockerfile", "docker container", "docker base image",
    "multi stage dockerfile", "dockerfile slim", "dockerfile devcontainer",
    "dockerfile ubuntu", "dockerfile alpine", "dockerfile debian"
]
DEFAULT_QUERIES = [
    "dockerfile python", "dockerfile node", "dockerfile typescript", "dockerfile javascript",
    "dockerfile golang", "dockerfile rust", "dockerfile java", "dockerfile kotlin", "dockerfile scala",
    "dockerfile php", "dockerfile ruby", "dockerfile csharp", "dockerfile dotnet", "dockerfile flask",
    "dockerfile django", "dockerfile fastapi", "dockerfile express", "dockerfile springboot",
    "dockerfile react", "dockerfile nextjs", "dockerfile vue", "dockerfile nuxt", "dockerfile svelte",
    "dockerfile laravel", "dockerfile symfony", "dockerfile postgres", "dockerfile mysql",
    "dockerfile mongodb", "dockerfile redis", "dockerfile nginx", "dockerfile apache",
    "dockerfile api", "dockerfile backend", "dockerfile frontend", "dockerfile microservices",
    "dockerfile monorepo", "dockerfile tensorflow", "dockerfile pytorch", "dockerfile huggingface",
    "dockerfile kubernetes", "dockerfile helm", "dockerfile gitlab", "dockerfile cicd",
    "dockerfile openshift", "dockerfile airflow", "dockerfile spark", "dockerfile jupyter",
    "dockerfile anaconda", "dockerfile dockerhub", "dockerfile datascience",
    "dockerfile databricks", "dockerfile github-actions", "dockerfile codequality"
]
SPECIAL_QUERIES = [
    "dockerfile base image", "dockerfile ci", "dockerfile cicd",
    "dockerfile templates", "dockerfile registry", "dockerfile minimal",
    "dockerfile multi-stage", "dockerfile builder", "dockerfile github workflow",
    "dockerfile production ready", "dockerfile examples", "dockerfile secure",
    "dockerfile dotnet", "dockerfile rust", "dockerfile slim image",
    "dockerfile cloud native", "dockerfile init", "dockerfile test image"
]

DEFAULT_OUTPUT_RAW = Path("data/metadata/repos_raw.json")
DEFAULT_OUTPUT_FILTERED = Path("data/metadata/repos_filtered.json")
DEFAULT_POPULAR_REPOS = Path("data/metadata/manual_popular_repos.json")

def generate_queries():
    queries = set()
    queries.update(GENERAL)
    queries.update(DEFAULT_QUERIES)
    queries.update(SPECIAL_QUERIES)

    for lang in LANGUAGES:
        for topic in TOPICS:
            queries.add(f"dockerfile {lang} {topic}")

    return sorted(queries)

def run_query(query, limit):
    print(f"🔍 Searching: {query}")
    result = subprocess.run([
        "gh", "search", "repos", query,
        "--limit", str(limit),
        "--json", "fullName,description,stargazersCount,updatedAt,createdAt,pushedAt,url"
    ], capture_output=True, text=True)

    if result.returncode != 0:
        print(f"❌ Query failed: {result.stderr.strip()}")
        return []

    try:
        data = json.loads(result.stdout)
        if not data:
            print(f"⚠️ No results for: {query}")
        return data
    except Exception as e:
        print(f"❌ JSON error: {e}")
        return []

def deduplicate_and_filter(repos, min_stars, min_date):
    seen = set()
    filtered = []
    for r in repos:
        name = r["fullName"]
        updated = datetime.strptime(r["updatedAt"][:10], "%Y-%m-%d")
        if name in seen:
            continue
        if r["stargazersCount"] < min_stars:
            continue
        if updated < min_date:
            continue
        seen.add(name)
        filtered.append(r)
    return filtered

def load_manual_popular_repos(path):
    if not path.exists():
        print(f"⚠️ File not found: {path}")
        return []

    with open(path, "r") as f:
        try:
            data = json.load(f)
            enriched = []
            for r in data:
                enriched.append({
                    "fullName": r["fullName"],
                    "url": r.get("url", ""),
                    "description": r.get("description", ""),
                    "stargazersCount": r.get("stargazersCount", 9999),
                    "updatedAt": r.get("updatedAt", "2024-01-01T00:00:00Z"),
                    "createdAt": r.get("createdAt", "2020-01-01T00:00:00Z"),
                    "pushedAt": r.get("pushedAt", "2024-01-01T00:00:00Z")
                })
            return enriched
        except Exception as e:
            print(f"❌ Error loading popular repositories: {e}")
            return []

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--raw_output", type=Path, default=DEFAULT_OUTPUT_RAW)
    parser.add_argument("--filtered_output", type=Path, default=DEFAULT_OUTPUT_FILTERED)
    parser.add_argument("--queries", type=int, default=-1)
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--min_stars", type=int, default=5)
    parser.add_argument("--min_date", type=str, default="2021-01-01")
    parser.add_argument("--refresh", action="store_true")
    parser.add_argument("--include_popular", action="store_true")
    parser.add_argument("--popular_file", type=Path, default=DEFAULT_POPULAR_REPOS)
    args = parser.parse_args()

    args.raw_output.parent.mkdir(parents=True, exist_ok=True)
    args.filtered_output.parent.mkdir(parents=True, exist_ok=True)
    min_date = datetime.strptime(args.min_date, "%Y-%m-%d")

    if args.raw_output.exists() and not args.refresh:
        print(f"ℹ️ {args.raw_output} already exists. Use --refresh to overwrite.")
        return

    all_queries = generate_queries()
    queries = all_queries if args.queries == -1 else all_queries[:args.queries]

    print(f"🧠 Generated {len(queries)} queries:")
    for q in queries:
        print(" •", q)

    all_results = []
    for idx, query in enumerate(queries, 1):
        print(f"\n🔄 [{idx}/{len(queries)}]")
        results = run_query(query, args.limit)
        all_results.extend(results)
        time.sleep(5)

    if args.include_popular:
        print(f"\n📌 Adding popular repositories from: {args.popular_file}")
        all_results.extend(load_manual_popular_repos(args.popular_file))

    print(f"\n📈 Total queries: {len(queries)}")
    print(f"📦 Raw results: {len(all_results)}")
    with open(args.raw_output, "w") as f:
        json.dump(all_results, f, indent=2)

    clean_repos = deduplicate_and_filter(all_results, args.min_stars, min_date)
    with open(args.filtered_output, "w") as f:
        json.dump(clean_repos, f, indent=2)

    print(f"✅ After filtering: {len(clean_repos)} repositories")
    print(f"📁 Saved to: {args.filtered_output}")

if __name__ == "__main__":
    main()
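For the --include_popular path, load_manual_popular_repos expects data/metadata/manual_popular_repos.json to be a JSON list of objects with at least a fullName key; missing fields fall back to the defaults shown in the code. A minimal illustrative example (the repository names are placeholders, not real entries):

[
  {
    "fullName": "example-org/example-app",
    "url": "https://github.com/example-org/example-app",
    "stargazersCount": 12000
  },
  { "fullName": "another-org/another-service" }
]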
scripts/04_clone_and_extract.py ADDED
@@ -0,0 +1,118 @@
# 04_clone_and_extract.py
# Dockerfile extraction – v3 (no parser, with correct JSONL output)

import json
import shutil
import hashlib
from pathlib import Path
from git import Repo
from datetime import datetime
import argparse

# === Paths
REPO_LIST_PATH = Path("data/metadata/repos_filtered.json")
CLONE_DIR = Path("temp_repos")
OUTPUT_FILE = Path("data/raw/dockerfiles.jsonl")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

REMOVE_DIRS = [".git", ".github", "docs", "tests", "__pycache__", ".idea", ".vscode"]

def clean_repo(path: Path):
    for d in REMOVE_DIRS:
        shutil.rmtree(path / d, ignore_errors=True)

def compute_sha1(text: str) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()

def is_valid_dockerfile(path: Path) -> bool:
    try:
        text = path.read_text(encoding="utf-8").strip()
        lines = [l.strip().lower() for l in text.splitlines() if l.strip()]
        if len(lines) < 5 or path.stat().st_size > 200_000:
            return False
        top_lines = lines[:10]
        has_from = any(l.startswith("from") for l in top_lines)
        has_run = any(l.startswith(("run", "cmd", "copy")) for l in lines)
        return has_from and has_run
    except Exception as e:
        print(f"⚠️ Validation error for {path}: {e}")
        return False

def find_dockerfiles(repo_path: Path) -> list[Path]:
    return [
        f for f in repo_path.rglob("*")
        if f.name.lower() == "dockerfile" and f.is_file()
    ]

def clone_repo(url: str, full_name: str) -> Path | None:
    dest = CLONE_DIR / full_name.replace("/", "__")
    if dest.exists():
        print(f"⚠️ Repo {full_name} already exists – skipping clone.")
        return dest
    try:
        print(f"⬇️ Cloning {full_name}...")
        Repo.clone_from(url, dest, depth=1)
        clean_repo(dest)
        return dest
    except Exception as e:
        print(f"❌ Clone error for {full_name}: {e}")
        return None

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--purge", action="store_true", help="Delete each repository after extraction")
    args = parser.parse_args()

    with open(REPO_LIST_PATH) as f:
        repos = json.load(f)

    saved, skipped = 0, 0
    seen_hashes = set()

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
        for repo in repos:
            full_name = repo["fullName"]
            url = repo["url"]
            repo_path = clone_repo(url, full_name)
            if not repo_path:
                continue

            for file in find_dockerfiles(repo_path):
                if not is_valid_dockerfile(file):
                    skipped += 1
                    continue

                try:
                    lines = file.read_text(encoding="utf-8").strip().splitlines()
                    lines = [l.rstrip() for l in lines if l.strip()]
                    file_id = compute_sha1("\n".join(lines))
                    if file_id in seen_hashes:
                        skipped += 1
                        continue
                    seen_hashes.add(file_id)

                    json.dump({
                        "repo": full_name,
                        "path": str(file.relative_to(repo_path)),
                        "file_id": file_id,
                        "content": lines,
                        "size_bytes": file.stat().st_size,
                        "line_count": len(lines),
                        "valid": True,
                        "cloned_at": datetime.now().isoformat()
                    }, out_f)
                    out_f.write("\n")
                    saved += 1

                except Exception as e:
                    print(f"⚠️ Write error for {file}: {e}")
                    skipped += 1

            if args.purge:
                shutil.rmtree(repo_path, ignore_errors=True)

    print(f"\n✅ Saved {saved} valid Dockerfiles to {OUTPUT_FILE}")
    print(f"🚫 Skipped {skipped} files (invalid, broken, or duplicated)")

if __name__ == "__main__":
    main()
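Each accepted Dockerfile becomes one JSON line in data/raw/dockerfiles.jsonl with the fields written above; an illustrative record (all values are placeholders):

{"repo": "example-org/example-app", "path": "docker/Dockerfile", "file_id": "3f2b8c...", "content": ["FROM python:3.11-slim", "COPY . /app", "RUN pip install -r requirements.txt"], "size_bytes": 182, "line_count": 3, "valid": true, "cloned_at": "2024-01-01T12:00:00"}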
scripts/05_generate_fixes.py ADDED
@@ -0,0 +1,66 @@
# 05_generate_fixes.py
# Generate a ready-made fixes.json file for the most common Hadolint rules

import json
from pathlib import Path

# === Fix definitions for known Hadolint rules ===
fixes = {
    "DL3008": "Pin versions in apt-get install (e.g. 'apt-get install <package>=<version>') for reproducible builds.",
    "DL4006": "Set 'SHELL [\"/bin/bash\", \"-o\", \"pipefail\", \"-c\"]' before RUN instructions that use a pipe.",
    "DL3003": "Use WORKDIR to switch directories instead of 'RUN cd ...'.",
    "DL4000": "MAINTAINER is deprecated; use a LABEL instead.",
    "DL3015": "Use '--no-install-recommends' with apt-get install to avoid unnecessary packages.",
    "DL3047": "Use 'wget --progress=dot:giga' to avoid excessively bloated build logs.",
    "DL3059": "Consider consolidating multiple consecutive RUN instructions.",
    "DL3009": "Delete the apt cache after installing packages using 'rm -rf /var/lib/apt/lists/*'.",
    "DL3018": "Pin versions in apk add (e.g. 'apk add <package>=<version>').",
    "SC2086": "Use quotes to prevent word splitting and globbing in shell commands.",
    "DL3006": "Always tag the version of the base image (avoid using 'latest').",
    "DL3020": "Use COPY instead of ADD for files and folders.",
    "DL3025": "Use JSON (exec-form) notation for CMD and ENTRYPOINT arguments.",
    "DL3042": "Use 'pip install --no-cache-dir' to avoid keeping the pip cache in the image.",
    "DL3004": "Do not use sudo in Dockerfiles. Run as root or configure user permissions properly.",
    "DL3013": "Specify version numbers in pip install commands to ensure reproducibility.",
    "DL3027": "Do not use 'apt'; use 'apt-get' or 'apt-cache' instead, as 'apt' is meant for end users.",
    "DL3007": "Do not use the 'latest' tag; pin the image tag explicitly.",
    "SC2046": "Quote arguments to prevent word splitting when using command substitution.",
    "DL3033": "Specify versions with yum install (e.g. 'yum install <package>-<version>').",
    "SC2028": "echo may not expand escape sequences; use printf instead.",
    "DL3019": "Use the '--no-cache' switch with apk to avoid having to remove /var/cache/apk/*.",
    "DL4001": "Use either wget or curl, but not both, in the same image.",
    "DL3002": "The last USER should not be root; switch to a non-root user.",
    "SC2016": "Expressions do not expand in single quotes; use double quotes for expansion.",
    "DL3048": "Use valid label keys (lowercase, namespaced) in LABEL instructions.",
    "DL3005": "Do not use apt-get upgrade or dist-upgrade.",
    "DL3045": "Set a WORKDIR before using COPY with a relative destination.",
    "DL3032": "Run 'yum clean all' after yum commands to reduce image size.",
    "DL3016": "Pin versions in npm install (e.g. 'npm install <package>@<version>').",
    "SC2035": "Use './*glob*' or '-- *glob*' so file names starting with dashes are not treated as options.",
    "DL3041": "Specify versions with dnf install (e.g. 'dnf install <package>-<version>').",
    "SC2043": "This loop only runs once over a constant value; loop over a list or drop the loop.",
    "SC2155": "Declare and assign variables in separate steps to avoid masking return codes.",
    "DL3028": "Pin versions in gem install (e.g. 'gem install <gem>:<version>').",
    "DL1000": "Use valid Dockerfile syntax; check for missing instructions or arguments.",
    "SC2164": "Use 'cd ... || exit' or check directory change status to avoid silent errors.",
    "SC2006": "Use modern command substitution: $(...) instead of backticks.",
    "DL3040": "Run 'dnf clean all' after dnf commands to reduce image size.",
    "DL3014": "Use the '-y' switch with apt-get install to avoid interactive prompts.",
    "DL3022": "COPY --from should reference a previously defined FROM alias.",
    "SC3037": "In POSIX sh, echo flags are undefined; use printf instead.",
    "DL3000": "Use an absolute path for WORKDIR.",
    "DL3029": "Do not use the --platform flag with FROM.",
    "SC1088": "Quote strings properly to avoid syntax errors in scripts.",
    "SC3009": "Avoid using variables in redirections or pipelines unless necessary.",
    "SC2251": "Use proper syntax when comparing strings in shell conditions.",
    "SC1001": "Use POSIX-compliant syntax unless bash features are required.",
    "SC3003": "Quote paths and variables consistently to avoid unexpected behavior.",
    "SC1091": "Ensure files sourced with . or source exist and are accessible."
}

# === Write to file ===
fixes_path = Path("data/fixes/fixes.json")
fixes_path.parent.mkdir(parents=True, exist_ok=True)
fixes_path.write_text(json.dumps(fixes, indent=2, ensure_ascii=False))

print(f"✅ Saved {len(fixes)} rules to {fixes_path}")
scripts/06_label_with_fixes.py ADDED
@@ -0,0 +1,175 @@
# 06_label_with_fixes.py – version v4

import json
import tempfile
import subprocess
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime

# === Paths ===
INPUT_PATH = Path("data/raw/dockerfiles.jsonl")
OUTPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
FAILED_LOG = Path("data/labeled/failed_dockerfiles.jsonl")
MISSING_FIXES_LOG = Path("data/labeled/missing_fixes.txt")
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

FIXES_PATH = Path("data/fixes/fixes.json")
HADOLINT_BIN = "hadolint"
MAX_WORKERS = 6
TIMEOUT_SECONDS = 5

# === Global fix dictionary ===
with open(FIXES_PATH, encoding="utf-8") as f:
    FIXES = json.load(f)

MISSING_FIXES = set()

def attach_fixes(rules_triggered: list[str]) -> dict:
    suggestions = {}
    for rule in rules_triggered:
        if rule in FIXES:
            suggestions[rule] = FIXES[rule]
        else:
            MISSING_FIXES.add(rule)
    return suggestions

def lint_dockerfile(entry: dict) -> dict:
    try:
        content = entry["content"]
        joined = "\n".join(content)

        with tempfile.NamedTemporaryFile("w", suffix=".Dockerfile", delete=False) as tmp:
            tmp.write(joined)
            tmp.flush()
            temp_path = tmp.name

        result = subprocess.run(
            [HADOLINT_BIN, temp_path, "-f", "json"],
            capture_output=True,
            text=True,
            timeout=TIMEOUT_SECONDS
        )

        Path(temp_path).unlink(missing_ok=True)

        if result.returncode == 0:
            return {
                "label": "good",
                "rules_triggered": [],
                "lines": {},
                "fix_suggestions": {},
                "repo": entry["repo"],
                "path": entry["path"],
                "content": content,
                "timestamp": datetime.now().isoformat()
            }

        try:
            findings = json.loads(result.stdout)
            rules = sorted(set(item["code"] for item in findings if "code" in item))
            line_map = {}
            for item in findings:
                code = item.get("code")
                line = item.get("line")
                if code and line:
                    line_map.setdefault(code, line)

            fix_suggestions = attach_fixes(rules)

        except Exception:
            rules = ["lint-parse-error"]
            line_map = {}
            fix_suggestions = {}

        return {
            "label": "bad",
            "rules_triggered": rules,
            "lines": line_map,
            "fix_suggestions": fix_suggestions,
            "repo": entry["repo"],
            "path": entry["path"],
            "content": content,
            "timestamp": datetime.now().isoformat()
        }

    except subprocess.TimeoutExpired:
        return {
            "label": "bad",
            "rules_triggered": ["lint-timeout"],
            "lines": {},
            "fix_suggestions": {},
            "repo": entry.get("repo"),
            "path": entry.get("path"),
            "content": entry.get("content"),
            "timestamp": datetime.now().isoformat()
        }

    except Exception as e:
        return {
            "label": "bad",
            "rules_triggered": [f"lint-error:{str(e)}"],
            "lines": {},
            "fix_suggestions": {},
            "repo": entry.get("repo"),
            "path": entry.get("path"),
            "content": entry.get("content"),
            "timestamp": datetime.now().isoformat()
        }

def main():
    with open(INPUT_PATH, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    print(f"🚀 Starting analysis of {len(records)} Dockerfiles (workers={MAX_WORKERS})")

    results, failed = [], []

    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(lint_dockerfile, row) for row in records]

        for i, future in enumerate(as_completed(futures)):
            try:
                result = future.result()
                if "rules_triggered" not in result:
                    failed.append(result)
                else:
                    results.append(result)
            except Exception as e:
                failed.append({
                    "label": "bad",
                    "rules_triggered": [f"future-error:{str(e)}"],
                    "lines": {},
                    "fix_suggestions": {},
                    "repo": "unknown",
                    "path": "unknown",
                    "content": [],
                    "timestamp": datetime.now().isoformat()
                })

            if (i + 1) % 250 == 0:
                print(f" 🔄 {i+1}/{len(records)} processed...")

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
        for rec in results:
            json.dump(rec, f_out)
            f_out.write("\n")

    with open(FAILED_LOG, "w", encoding="utf-8") as f_fail:
        for rec in failed:
            json.dump(rec, f_fail)
            f_fail.write("\n")

    # attach_fixes runs inside worker processes, so updates to MISSING_FIXES there
    # are not visible in the parent; recover the missing rules from the collected results.
    for rec in results:
        for rule in rec.get("rules_triggered", []):
            if rule not in FIXES and not rule.startswith(("lint-", "future-")):
                MISSING_FIXES.add(rule)

    if MISSING_FIXES:
        print(f"\n⚠️ Missing fixes for {len(MISSING_FIXES)} rules – writing them to {MISSING_FIXES_LOG}")
        with open(MISSING_FIXES_LOG, "w", encoding="utf-8") as f_miss:
            for rule in sorted(MISSING_FIXES):
                f_miss.write(rule + "\n")
    else:
        print("✅ Every triggered rule has an assigned fix!")

    print(f"\n✅ Saved {len(results)} labeled Dockerfiles with fixes → {OUTPUT_PATH}")
    print(f"❌ Failed: {len(failed)} → {FAILED_LOG}")

if __name__ == "__main__":
    main()
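A labeled record in data/labeled/labeled_dockerfiles.jsonl then carries the Hadolint findings plus the matching fix texts; a sketch of one "bad" line (all values are illustrative placeholders):

{"label": "bad", "rules_triggered": ["DL3007", "DL3059"], "lines": {"DL3007": 1, "DL3059": 4}, "fix_suggestions": {"DL3007": "...", "DL3059": "..."}, "repo": "example-org/example-app", "path": "Dockerfile", "content": ["FROM python:latest", "..."], "timestamp": "2024-01-01T12:00:00"}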
scripts/07_explore_labeled_dataset.py ADDED
@@ -0,0 +1,126 @@
# 07_explore_labeled_dataset.py

import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

# === Paths and configuration
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
TOP_N = 30

# === Initialization
labels_counter = Counter()
rules_counter = Counter()
rules_per_file = []
lines_with_errors_per_file = []
lengths = []
all_line_positions = []

fixable_counter = 0
unique_rules_with_fixes = set()

print("🔍 Analyzing data...")

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        labels_counter[obj["label"]] += 1

        if obj["label"] == "bad":
            rules = obj.get("rules_triggered", [])
            rules_counter.update(rules)
            rules_per_file.append(len(rules))

            # Fix analysis
            fixes = obj.get("fix_suggestions", {})
            if fixes:
                fixable_counter += 1
                unique_rules_with_fixes.update(fixes.keys())

            # Error lines – v4
            lines = obj.get("lines", {}).values()
            line_set = set(lines)
            lines_with_errors_per_file.append(len(line_set))
            all_line_positions.extend(lines)

        # File length
        lengths.append(len(obj["content"]))

# === Overall statistics
print("\n📊 Statistics:")
print(f"✅ Good: {labels_counter['good']}")
print(f"❌ Bad: {labels_counter['bad']}")
print(f"🧩 Total number of unique rules: {len(rules_counter)}")
print(f"🛠 Files with at least one available fix: {fixable_counter}")
print(f"🔧 Unique rules with an assigned fix: {len(unique_rules_with_fixes)}")

# === Top N rules
top_rules = rules_counter.most_common(TOP_N)
print(f"\n🏆 Top {TOP_N} most frequently violated rules:")
for code, count in top_rules:
    print(f" {code}: {count}x")

# === Save the top N to a JSON file
TOP_RULES_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(TOP_RULES_PATH, "w", encoding="utf-8") as f:
    json.dump([code for code, _ in top_rules], f, indent=2)
print(f"\n💾 Saved top {TOP_N} rules to {TOP_RULES_PATH}")

# === Dockerfile lengths
lengths_np = np.array(lengths)
print("\n📏 Dockerfile length (lines):")
print(f" Mean: {lengths_np.mean():.2f}")
print(f" Median: {np.median(lengths_np):.0f}")
print(f" Min: {lengths_np.min()}")
print(f" Max: {lengths_np.max()}")

# === Histograms
Path("data/metadata").mkdir(parents=True, exist_ok=True)

# 1. File lengths
plt.figure()
plt.hist(lengths_np, bins=40, color="skyblue", edgecolor="black")
plt.title("Distribution of Dockerfile lengths")
plt.xlabel("Number of lines")
plt.ylabel("Number of files")
plt.grid(True)
plt.tight_layout()
plt.savefig("data/metadata/dockerfile_length_hist.png")

# 2. Rules per file
if rules_per_file:
    plt.figure()
    plt.hist(rules_per_file, bins=range(1, max(rules_per_file)+2), color="salmon", edgecolor="black")
    plt.title("Number of violated rules per file")
    plt.xlabel("Number of rules")
    plt.ylabel("Number of files")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/rules_per_file_hist.png")

# 3. Error lines per file
if lines_with_errors_per_file:
    plt.figure()
    plt.hist(lines_with_errors_per_file, bins=range(1, max(lines_with_errors_per_file)+2), color="orchid", edgecolor="black")
    plt.title("Number of lines with errors per file")
    plt.xlabel("Number of lines with errors")
    plt.ylabel("Number of files")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/error_lines_per_file_hist.png")

# 4. Distribution of error positions
if all_line_positions:
    plt.figure()
    plt.hist(all_line_positions, bins=50, color="gold", edgecolor="black")
    plt.title("Distribution of error positions (line numbers)")
    plt.xlabel("Line number")
    plt.ylabel("Number of errors")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("data/metadata/line_positions_hist.png")

print("\n📊 Saved plots to data/metadata/")
scripts/08_balance_dataset.py ADDED
@@ -0,0 +1,86 @@
# 08_balance_dataset.py

import json
import random
from pathlib import Path
from collections import Counter
import shutil

# === Paths
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
BACKUP_PATH = Path("data/labeled/labeled_dockerfiles_backup.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_PATH = INPUT_PATH
MAX_GOOD = 1500
MAX_BAD = 15000
TOP_N = 30

# === Backup
if INPUT_PATH.exists():
    if not BACKUP_PATH.exists():
        print(f"📦 Creating a backup → {BACKUP_PATH.name}")
        shutil.copy(INPUT_PATH, BACKUP_PATH)
    else:
        print(f"ℹ️ Backup already exists: {BACKUP_PATH.name}")

# === Load the top 30 rules
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    top_rules = set(json.load(f)[:TOP_N])
print(f"🏆 Using the top {TOP_N} rules")

# === Load data
print("🔍 Loading data...")
good_samples = []
bad_samples = []

with open(INPUT_PATH, encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        if obj["label"] == "good":
            good_samples.append(obj)
        elif obj["label"] == "bad":
            rules = set(obj.get("rules_triggered", []))
            if rules & top_rules:
                bad_samples.append(obj)

print(f"✅ Good: {len(good_samples)} | ❌ Bad containing top {TOP_N} rules: {len(bad_samples)}")

# === Random selection of GOOD samples
balanced_good = random.sample(good_samples, min(MAX_GOOD, len(good_samples)))

# === Select BAD samples by rarity of the top 30 rules
print("⚙️ Scoring BAD files by rule rarity...")

rule_freq = Counter()
for sample in bad_samples:
    rules = sample.get("rules_triggered", [])
    rule_freq.update(r for r in rules if r in top_rules)

def compute_score(sample):
    rules = set(sample.get("rules_triggered", [])) & top_rules
    return sum(1 / rule_freq[r] for r in rules if rule_freq[r] > 0)

scored_bad = sorted(
    bad_samples,
    key=lambda s: (
        compute_score(s),
        -len(set(s.get("rules_triggered", [])) & top_rules)
    ),
    reverse=True
)

balanced_bad = scored_bad[:MAX_BAD]

# === Merge and save
balanced_all = balanced_good + balanced_bad
random.shuffle(balanced_all)

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f_out:
    for rec in balanced_all:
        json.dump(rec, f_out)
        f_out.write("\n")

print(f"\n✅ Saved balanced dataset (top {TOP_N} rules only): {len(balanced_all)} → {OUTPUT_PATH.name}")
print(f" - Good: {len(balanced_good)}")
print(f" - Bad: {len(balanced_bad)}")
scripts/09.2_prepare_multilabel_dataset.py ADDED
@@ -0,0 +1,100 @@
# scripts/09.2_prepare_multilabel_dataset.py

import json
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# === Configuration
INPUT_PATH = Path("data/labeled/labeled_dockerfiles.jsonl")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_DIR = Path("data/processed/dataset_multilabel_top30")
TOKENIZER_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
SEED = 42

def load_top_rules():
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        return json.load(f)

def build_dataset(records, top_rules):
    rule2id = {r: i for i, r in enumerate(top_rules)}
    data = []
    for row in records:
        if row.get("label") != "bad":
            continue

        triggered = row.get("rules_triggered", [])
        multilabel = [0] * len(top_rules)
        matched = False

        for rule in triggered:
            if rule in rule2id:
                multilabel[rule2id[rule]] = 1
                matched = True

        if not matched:
            continue

        data.append({
            "text": "\n".join(row["content"]) if isinstance(row["content"], list) else str(row["content"]),
            "labels": multilabel,
            "meta_lines": row.get("lines", {}),
            "meta_fixes": row.get("fix_suggestions", {})  # field name written by 06_label_with_fixes.py
        })

    return data

def main():
    print("📥 Loading data...")
    top_rules = load_top_rules()
    print(f"🔝 Top {len(top_rules)} rules: {top_rules}")

    with INPUT_PATH.open(encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    dataset = build_dataset(records, top_rules)
    print(f"📦 Built {len(dataset)} multilabel examples.")

    if not dataset:
        print("❌ No data to process. Check the input data.")
        return

    print("🔀 Splitting into train/val/test...")
    train_val, test = train_test_split(dataset, test_size=0.1, random_state=SEED)
    train, val = train_test_split(train_val, test_size=0.1111, random_state=SEED)

    ds = DatasetDict({
        "train": Dataset.from_list(train),
        "validation": Dataset.from_list(val),
        "test": Dataset.from_list(test),
    })

    print("🔤 Tokenizing...")
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

    def tokenize_function(batch):
        texts = [str(x) if x is not None else "" for x in batch["text"]]
        return tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH
        )

    ds_tokenized = ds.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "meta_lines", "meta_fixes"]
    )

    print(f"💾 Saving to: {OUTPUT_DIR}")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    ds_tokenized.save_to_disk(str(OUTPUT_DIR))

    print("✅ Done.")

if __name__ == "__main__":
    main()
scripts/10.2_train_multilabel_model.py ADDED
@@ -0,0 +1,101 @@
# scripts/10.2_train_multilabel_model.py

import os
import json
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from datasets import load_from_disk
from torch.utils.data import default_collate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# === Configuration
DATA_PATH = "data/processed/dataset_multilabel_top30"
OUTPUT_DIR = "models/multilabel"
MODEL_NAME = "microsoft/codebert-base"
NUM_LABELS = 30
NUM_EPOCHS = 12
SEED = 42

# === Load data and tokenizer
print("📂 Loading data and tokenizer...")
ds = load_from_disk(DATA_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# === Model
print("🧠 Initializing the model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# === Metrics
def compute_metrics(pred):
    logits, labels = pred
    probs = 1 / (1 + np.exp(-logits))  # sigmoid
    preds = (probs > 0.5).astype(int)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="micro"),
        "precision": precision_score(labels, preds, average="micro"),
        "recall": recall_score(labels, preds, average="micro"),
    }

# === Batch collator: force float32 labels
def collate_fn(batch):
    batch = default_collate(batch)
    batch["labels"] = batch["labels"].float()
    return batch

# === Training arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_total_limit=2,
    seed=SEED,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_steps=50,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none"
)

# === Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds["train"].with_format("torch"),
    eval_dataset=ds["validation"].with_format("torch"),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    data_collator=collate_fn,
)

# === Training
print("🚀 Starting training...")
trainer.train()

# === Save model and logs
print("💾 Saving the model and logs...")
trainer.save_model(OUTPUT_DIR)

log_path = os.path.join(OUTPUT_DIR, "training_log.json")
with open(log_path, "w", encoding="utf-8") as f:
    json.dump(trainer.state.log_history, f, indent=2)

print(f"📝 Saved the training log to {log_path}")
print("✅ Done.")
scripts/11.2_evaluate_multilabel.py ADDED
@@ -0,0 +1,96 @@
# 11.2_evaluate_multilabel.py

import os
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from torch.utils.data import default_collate
from sklearn.metrics import classification_report, multilabel_confusion_matrix

# === Paths
MODEL_DIR = Path("models/multilabel/")
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")

OUT_DIR = MODEL_DIR
REPORT_CSV = OUT_DIR / "classification_report.csv"
REPORT_JSON = OUT_DIR / "metrics.json"
CONF_MATRIX_PNG = OUT_DIR / "confusion_matrix_multilabel.png"

# === Data collator for float32 labels
def collate_fn(batch):
    batch = default_collate(batch)
    batch["labels"] = batch["labels"].float()
    return batch

# === Load top_rules
with open(TOP_RULES_PATH) as f:
    top_rules = json.load(f)

# === Load model + tokenizer
print("📂 Loading the model...")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
except Exception:
    print("⚠️ No tokenizer found with the model — downloading microsoft/codebert-base")
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    tokenizer.save_pretrained(MODEL_DIR)

# === Load the data and create a Trainer
dataset = load_from_disk(str(DATASET_DIR))
trainer = Trainer(model=model, data_collator=collate_fn)

# === Prediction
print("🔍 Predicting on the test set...")
predictions = trainer.predict(dataset["test"].with_format("torch"))
probs = torch.sigmoid(torch.tensor(predictions.predictions)).numpy()
y_pred = (probs > 0.5).astype(int)
y_true = predictions.label_ids

# === Classification report
print("📊 Classification report:")
report_dict = classification_report(
    y_true,
    y_pred,
    target_names=top_rules,
    zero_division=0,
    output_dict=True
)
report_text = classification_report(y_true, y_pred, target_names=top_rules, zero_division=0)
print(report_text)

# === Save reports
pd.DataFrame(report_dict).transpose().to_csv(REPORT_CSV)
with open(REPORT_JSON, "w") as f:
    json.dump(report_dict, f, indent=2)

print(f"💾 Saved CSV report: {REPORT_CSV}")
print(f"💾 Saved JSON metrics: {REPORT_JSON}")

# === Confusion matrix (aggregate) and per-rule support
print("🧱 Generating the multilabel confusion matrix...")
mcm = multilabel_confusion_matrix(y_true, y_pred)  # computed per rule; only per-rule support is plotted below
support = y_true.sum(axis=0).astype(int)

fig, ax = plt.subplots(figsize=(12, 8))
bars = plt.barh(range(len(top_rules)), support)
plt.yticks(range(len(top_rules)), top_rules)
plt.xlabel("Occurrences in the test set")
plt.title("🔢 Distribution of rule occurrences in the test set")

for i, bar in enumerate(bars):
    width = bar.get_width()
    plt.text(width + 1, bar.get_y() + bar.get_height() / 2, str(support[i]), va='center')

plt.tight_layout()
plt.savefig(CONF_MATRIX_PNG)
plt.close()
print(f"🖼️ Saved confusion matrix as PNG: {CONF_MATRIX_PNG}")
scripts/12.2_predict_multilabel_file.py ADDED
@@ -0,0 +1,88 @@
# 12.2_predict_multilabel_file.py
# Usage: python scripts/12.2_predict_multilabel_file.py test/Dockerfile --debug

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import sys
from pathlib import Path
import numpy as np
import json

# === Paths
MODEL_DIR = Path("models/multilabel/")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
MAX_LENGTH = 512
THRESHOLD = 0.5  # detection threshold

# === Load the rule labels
def load_labels():
    with open(TOP_RULES_PATH, encoding="utf-8") as f:
        return json.load(f)

# === Load the model and tokenizer
def load_model_and_tokenizer():
    if MODEL_DIR.exists():
        tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
    else:
        raise FileNotFoundError(f"❌ Model directory not found: {MODEL_DIR}")
    model.eval()
    return tokenizer, model

# === Prediction
def predict(filepath: Path, tokenizer, model, labels, threshold=THRESHOLD, debug=False):
    text = filepath.read_text(encoding="utf-8")

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.sigmoid(logits).squeeze().cpu().numpy()

    triggered = [(labels[i], probs[i]) for i in range(len(labels)) if probs[i] > threshold]
    top5 = np.argsort(probs)[-5:][::-1]

    print(f"\n🧪 Prediction for file: {filepath.name}")
    print(f"📄 File length: {len(text.splitlines())} lines")

    if triggered:
        print(f"\n🚨 Detected rules (p > {threshold}):")
        for rule, p in triggered:
            print(f" - {rule}: {p:.3f}")
    else:
        print("✅ No issues detected (no rule exceeded the threshold)")

    if debug:
        print("\n🛠 DEBUG INFO:")
        print(f"📝 Text snippet:\n{text[:300]}")
        print(f"🔢 Tokens: {len(inputs['input_ids'][0])}")
        print(f"📈 Logits: {logits.squeeze().tolist()}")
        print("\n🔥 Top 5 predictions:")
        for idx in top5:
            print(f" - {labels[idx]}: {probs[idx]:.3f}")

# === Main
def main():
    if len(sys.argv) < 2:
        print("❌ Usage: python scripts/12.2_predict_multilabel_file.py /path/to/Dockerfile [--debug]")
        sys.exit(1)

    filepath = Path(sys.argv[1])
    debug = "--debug" in sys.argv

    if not filepath.exists():
        print(f"❌ File {filepath} does not exist.")
        sys.exit(1)

    labels = load_labels()
    tokenizer, model = load_model_and_tokenizer()
    predict(filepath, tokenizer, model, labels, debug=debug)

if __name__ == "__main__":
    main()
scripts/13.2_threshold_calibration.py ADDED
@@ -0,0 +1,71 @@
# 13.2_threshold_calibration.py – per-rule threshold calibration for the multilabel model

import json
import numpy as np
import torch
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from sklearn.metrics import f1_score
from pathlib import Path

# === Paths
MODEL_DIR = Path("models/multilabel")
DATASET_DIR = Path("data/processed/dataset_multilabel_top30")
TOP_RULES_PATH = Path("data/metadata/top_rules.json")
OUTPUT_PATH = MODEL_DIR / "thresholds.json"

# === Load the rule list
with open(TOP_RULES_PATH, encoding="utf-8") as f:
    labels = json.load(f)
label_count = len(labels)

# === Model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(str(MODEL_DIR.resolve()))
tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR.resolve()))

# === Trainer with BCE loss
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

trainer = MultilabelTrainer(model=model)

# === Validation split
ds = load_from_disk(str(DATASET_DIR.resolve()))
val_dataset = ds["validation"]

# === Prediction
print("🔍 Generating predictions on the validation set...")
predictions = trainer.predict(val_dataset)
logits = torch.tensor(predictions.predictions)
probs = torch.sigmoid(logits).numpy()
y_true = predictions.label_ids

# === Calibration
print("⚙️ Calibrating a threshold for each rule...")
thresholds = {}
search_space = np.arange(0.05, 0.96, 0.05)

for i, label in enumerate(labels):
    best_f1 = 0.0
    best_thresh = 0.5
    for t in search_space:
        y_pred = (probs[:, i] > t).astype(int)
        score = f1_score(y_true[:, i], y_pred, zero_division=0)
        if score > best_f1:
            best_f1 = score
            best_thresh = round(t, 3)
    thresholds[label] = best_thresh
    print(f"📈 {label}: threshold={best_thresh} (f1={best_f1:.4f})")

# === Save thresholds
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(thresholds, f, indent=2)

print(f"\n✅ Saved thresholds to {OUTPUT_PATH}")
scripts/14.2_generate_rules_description.py ADDED
@@ -0,0 +1,63 @@
# 14.2_generate_rules_description.py

import json
from pathlib import Path

# === Top 30 rules used in v4
top_rules = [
    "DL4006", "DL3008", "SC2086", "DL3003", "DL3015", "DL3047", "DL3009", "DL3004", "DL4001", "DL4000",
    "DL3059", "DL3018", "SC2016", "SC2046", "DL3006", "SC2028", "DL3027", "DL3020", "DL3025", "DL3042",
    "DL3013", "DL3007", "DL3033", "SC2043", "DL3019", "DL3005", "DL3002", "DL3048", "DL3045", "DL3032"
]

# === Rule descriptions — based on the Hadolint documentation
descriptions = {
    "DL4006": "Set the SHELL option -o pipefail before using RUN with a pipe.",
    "DL3008": "Pin versions in apt-get install. Avoid floating dependencies.",
    "SC2086": "Double quote to prevent globbing and word splitting.",
    "DL3003": "Use WORKDIR to switch to a directory instead of RUN cd.",
    "DL3015": "Avoid additional packages by specifying --no-install-recommends.",
    "DL3047": "Use wget with --progress to avoid excessively bloated build logs.",
    "DL3009": "Delete the apt-get lists after installing packages.",
    "DL3004": "Do not use sudo as it leads to unpredictable behavior in containers.",
    "DL4001": "Either use wget or curl, but not both.",
    "DL4000": "MAINTAINER is deprecated. Use LABEL instead.",
    "DL3059": "Multiple consecutive RUN instructions should be combined.",
    "DL3018": "Pin versions in apk add commands.",
    "SC2016": "Expressions don't expand in single quotes. Use double quotes.",
    "SC2046": "Quote this to prevent word splitting.",
    "DL3006": "Always tag the version of the base image explicitly.",
    "SC2028": "echo may not expand escape sequences. Use printf instead.",
    "DL3027": "Do not use apt; use apt-get or apt-cache instead.",
    "DL3020": "Use COPY instead of ADD for files and folders.",
    "DL3025": "Use JSON notation for CMD and ENTRYPOINT arguments.",
    "DL3042": "Avoid use of the cache directory with pip. Use pip install --no-cache-dir.",
    "DL3013": "Pin versions in pip install to ensure reproducibility.",
    "DL3007": "Using latest is prone to errors. Pin the image tag explicitly.",
    "DL3033": "Specify version with yum install to ensure reproducibility.",
    "SC2043": "This loop will only ever run once for a constant value.",
    "DL3019": "Use the --no-cache switch with apk to avoid caching package indexes.",
    "DL3005": "Do not use apt-get upgrade or dist-upgrade.",
    "DL3002": "Last USER should not be root.",
    "DL3048": "Invalid label key; use valid, namespaced label keys.",
    "DL3045": "COPY to a relative destination without WORKDIR set.",
    "DL3032": "'yum clean all' is missing after a yum command."
}

# === Build the JSON structure
output_data = {}
for rule in top_rules:
    output_data[rule] = {
        "code": rule,
        "title": descriptions.get(rule, "No title available."),
        "description": descriptions.get(rule, "No description available."),
        "documentation": ""
    }

# === Save
output_path = Path("data/metadata/rules_descriptions_en.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2)

print(f"✅ Saved {len(output_data)} rules to {output_path}")
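Taken together, the scripts form a pipeline from repository discovery to a calibrated multilabel classifier. A typical end-to-end run might look like the sketch below; it assumes gh is authenticated, hadolint is on PATH, and the virtualenv from 02_setup_env.sh is active (test/Dockerfile is the hypothetical path from the prediction script's own usage comment):

python scripts/01_create_structure.py
python scripts/03_fetch_github.py --queries -1 --limit 500 --min_stars 3 --refresh --include_popular
python scripts/04_clone_and_extract.py --purge
python scripts/05_generate_fixes.py
python scripts/06_label_with_fixes.py
python scripts/07_explore_labeled_dataset.py
python scripts/08_balance_dataset.py
python scripts/09.2_prepare_multilabel_dataset.py
python scripts/10.2_train_multilabel_model.py
python scripts/11.2_evaluate_multilabel.py
python scripts/13.2_threshold_calibration.py
python scripts/12.2_predict_multilabel_file.py test/Dockerfile --debug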