Spaces:
Running
Running
Update core/file_scanner.py
Browse files- core/file_scanner.py +65 -42
core/file_scanner.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
-
from typing import List,
|
| 3 |
from dataclasses import dataclass
|
| 4 |
|
| 5 |
@dataclass
|
|
@@ -9,66 +12,86 @@ class FileInfo:
|
|
| 9 |
extension: str
|
| 10 |
content: Optional[str] = None
|
| 11 |
encoding: Optional[str] = None
|
| 12 |
-
|
| 13 |
@property
|
| 14 |
def formatted_size(self) -> str:
|
|
|
|
| 15 |
if self.size < 1024:
|
| 16 |
return f"{self.size} B"
|
| 17 |
elif self.size < 1024 * 1024:
|
| 18 |
-
return f"{self.size/1024:.1f} KB"
|
| 19 |
else:
|
| 20 |
-
return f"{self.size/(1024*1024):.1f} MB"
|
|
|
|
| 21 |
|
| 22 |
class FileScanner:
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
'.go', '.rs', '.php', '.rb', '.ts', '.scala', '.kt',
|
| 27 |
-
'.cs', '.swift', '.m', '.sh', '.pl', '.r'
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
-
# スキャン対象から除外するディレクトリ
|
| 31 |
EXCLUDED_DIRS = {
|
| 32 |
-
'.git', '__pycache__', 'node_modules', 'venv',
|
| 33 |
-
'build', 'dist', 'target', 'bin', 'obj'
|
| 34 |
}
|
| 35 |
|
| 36 |
-
def __init__(self, base_dir: Path):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
self.base_dir = base_dir
|
| 38 |
-
|
|
|
|
|
|
|
| 39 |
def _should_scan_file(self, path: Path) -> bool:
|
|
|
|
|
|
|
| 40 |
if any(excluded in path.parts for excluded in self.EXCLUDED_DIRS):
|
| 41 |
return False
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
try:
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
try:
|
| 48 |
-
with file_path.open('r', encoding=
|
| 49 |
-
return f.read()
|
| 50 |
except UnicodeDecodeError:
|
| 51 |
-
#
|
| 52 |
with file_path.open('r', encoding='cp932') as f:
|
| 53 |
-
return f.read()
|
| 54 |
-
except
|
| 55 |
-
return None
|
| 56 |
-
|
| 57 |
def scan_files(self) -> List[FileInfo]:
|
|
|
|
|
|
|
|
|
|
| 58 |
if not self.base_dir.exists():
|
| 59 |
-
raise FileNotFoundError(f"
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
for entry in self.base_dir.rglob('*'):
|
| 64 |
if entry.is_file() and self._should_scan_file(entry):
|
| 65 |
-
content = self._read_file_content(entry)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
| 1 |
+
# core/file_scanner.py
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Set, Tuple

import chardet
|
| 7 |
|
| 8 |
@dataclass
|
|
|
|
| 12 |
extension: str
|
| 13 |
content: Optional[str] = None
|
| 14 |
encoding: Optional[str] = None
|
| 15 |
+
|
| 16 |
@property
def formatted_size(self) -> str:
    """Return the file size as a human-readable string in B, KB or MB."""
    kib = 1024
    mib = kib * kib
    byte_count = self.size
    # Check the largest unit first so each branch is a simple guard clause.
    if byte_count >= mib:
        return f"{byte_count / mib:.1f} MB"
    if byte_count >= kib:
        return f"{byte_count / kib:.1f} KB"
    return f"{byte_count} B"
|
| 25 |
+
|
| 26 |
|
| 27 |
class FileScanner:
    """Recursively find files with selected extensions and load their text.

    Files whose path below ``base_dir`` contains a component listed in
    ``EXCLUDED_DIRS`` are skipped.
    """

    # Path components that exclude a file when they appear anywhere between
    # ``base_dir`` and the file (the file name itself included).
    EXCLUDED_DIRS = {
        '.git', '__pycache__', 'node_modules', 'venv',
        '.env', 'build', 'dist', 'target', 'bin', 'obj'
    }

    # Number of leading bytes handed to chardet for encoding detection.
    _SNIFF_BYTES = 4096
    # chardet confidence that must be exceeded to trust its guess.
    _MIN_CONFIDENCE = 0.7

    def __init__(self, base_dir: Path, target_extensions: Set[str]):
        """Store the scan root and the normalized extension filter.

        Args:
            base_dir: Directory where the recursive scan starts.
            target_extensions: Extensions to include, e.g.
                ``{'.py', '.js', '.md'}``. Matching is case-insensitive and
                a missing leading dot is added, so ``'PY'`` means ``'.py'``.
        """
        self.base_dir = Path(base_dir)
        self.target_extensions = {
            self._normalize_extension(ext) for ext in target_extensions
        }

    @staticmethod
    def _normalize_extension(ext: str) -> str:
        """Lower-case *ext* and make sure a non-empty value starts with a dot."""
        ext = ext.lower()
        if ext and not ext.startswith('.'):
            ext = '.' + ext
        return ext

    def _should_scan_file(self, path: Path) -> bool:
        """Return True when *path* passes the directory and extension filters."""
        # Only look at the components below base_dir: an ancestor of the scan
        # root that happens to be called e.g. "build" or "bin" must not
        # exclude every file in the project.
        try:
            relative = path.relative_to(self.base_dir)
        except ValueError:
            # path is not located under base_dir; check it as given.
            relative = path
        if any(part in self.EXCLUDED_DIRS for part in relative.parts):
            return False
        return path.suffix.lower() in self.target_extensions

    def _read_file_content(
        self, file_path: Path
    ) -> Tuple[Optional[str], Optional[str]]:
        """Read *file_path* as text and report the encoding that was used.

        The first ``_SNIFF_BYTES`` bytes are analysed with chardet. Its guess
        is used when the confidence exceeds ``_MIN_CONFIDENCE``; otherwise
        UTF-8 is assumed. If decoding fails, cp932 is tried once more.

        Returns:
            ``(content, encoding)`` on success, or ``(None, None)`` when the
            file cannot be opened or decoded.
        """
        try:
            with file_path.open('rb') as raw:
                head = raw.read(self._SNIFF_BYTES)
            detected = chardet.detect(head)
            guess = detected.get('encoding')
            confidence = detected.get('confidence') or 0.0
            # chardet reports no encoding for empty or undecidable input.
            if guess and confidence > self._MIN_CONFIDENCE:
                encoding = guess
            else:
                encoding = 'utf-8'

            try:
                with file_path.open('r', encoding=encoding) as f:
                    return f.read(), encoding
            except UnicodeDecodeError:
                # Retry with cp932 (common for Japanese files on Windows).
                with file_path.open('r', encoding='cp932') as f:
                    return f.read(), 'cp932'
        except (OSError, LookupError, ValueError):
            # OSError: unreadable file; LookupError: codec name unknown to
            # Python; ValueError: covers UnicodeDecodeError from the retry.
            return None, None

    def scan_files(self) -> "List[FileInfo]":
        """Collect every matching file below ``base_dir``.

        Returns:
            ``FileInfo`` objects sorted by the string form of their resolved
            path. ``content`` and ``encoding`` are ``None`` for files that
            could not be read.

        Raises:
            FileNotFoundError: If ``base_dir`` does not exist.
        """
        if not self.base_dir.exists():
            raise FileNotFoundError(f"指定ディレクトリが見つかりません: {self.base_dir}")

        collected_files = []
        for entry in self.base_dir.glob("**/*"):
            # Run the pure path filter first; is_file() touches the disk.
            if not (self._should_scan_file(entry) and entry.is_file()):
                continue
            try:
                size = entry.stat().st_size
            except OSError:
                # The file vanished or became unreadable during the scan.
                continue
            content, encoding = self._read_file_content(entry)
            collected_files.append(
                FileInfo(
                    path=entry.resolve(),
                    size=size,
                    extension=entry.suffix.lower(),
                    content=content,
                    encoding=encoding,
                )
            )
        # Sort by the string form of the path for a stable ordering.
        return sorted(collected_files, key=lambda info: str(info.path))
|