document_redaction / tools /secure_path_utils.py
seanpedrickcase's picture
Further fixes on uncontrolled path issue
5345e1f
raw
history blame
8.02 kB
"""
Secure path utilities to prevent path injection attacks.
This module provides secure alternatives to os.path operations that validate
and sanitize file paths to prevent directory traversal and other path-based attacks.
"""
import logging
import os
import re
from pathlib import Path
from typing import Optional, Union
logger = logging.getLogger(__name__)
def sanitize_filename(filename: str, max_length: int = 255) -> str:
    """
    Sanitize a filename to prevent path injection attacks.

    Directory components are dropped (forward slashes and backslashes are
    both treated as separators on every platform), reserved and control
    characters are replaced with an underscore, runs of dots are collapsed
    to a single dot, and leading/trailing dots and spaces are stripped.
    The result never exceeds ``max_length`` characters.

    Args:
        filename: The filename to sanitize
        max_length: Maximum length of the sanitized filename (at least 1)

    Returns:
        A sanitized filename safe for use in file operations

    Raises:
        ValueError: If the filename is empty or not a string, or if
            max_length is smaller than 1
    """
    if not filename or not isinstance(filename, str):
        raise ValueError("Filename must be a non-empty string")
    if max_length < 1:
        # A zero/negative budget cannot produce a usable (non-empty) name.
        raise ValueError("max_length must be at least 1")

    # os.path.basename only splits on the native separator, so on POSIX a
    # Windows-style "..\..\name" would survive intact. Normalise first so
    # directory components are removed whichever separator was used.
    filename = os.path.basename(filename.replace("\\", "/"))

    # Replace characters that are reserved on common filesystems, plus
    # control characters; everything else is kept as-is.
    sanitized = re.sub(r'[<>:"|?*\x00-\x1f]', "_", filename)

    # Collapse runs of dots so no ".." sequence can remain.
    sanitized = re.sub(r"\.{2,}", ".", sanitized)

    # Remove leading/trailing dots and spaces
    sanitized = sanitized.strip(". ")

    # Ensure it's not empty after sanitization
    if not sanitized:
        sanitized = "sanitized_file"

    # Truncate if too long, preserving the extension when it fits.
    if len(sanitized) > max_length:
        name, ext = os.path.splitext(sanitized)
        if len(ext) < max_length:
            sanitized = name[: max_length - len(ext)] + ext
        else:
            # The extension alone would use up the whole budget (a negative
            # slice index here used to yield an over-long result), so fall
            # back to a plain cut and tidy any dot/space left at the end.
            sanitized = sanitized[:max_length].rstrip(". ")

    return sanitized
def secure_path_join(base_path: Union[str, Path], *path_parts: str) -> Path:
    """
    Join path components onto a base directory and refuse results outside it.

    Empty components are skipped. A component containing a reserved or
    control character, or two or more consecutive dots, is passed through
    sanitize_filename() (which keeps only its sanitized final component);
    any other component is used unchanged. The joined path is resolved and
    must lie inside the resolved base directory.

    Args:
        base_path: The base directory path
        *path_parts: Additional path components to join

    Returns:
        The resolved Path of the joined components

    Raises:
        PermissionError: If the resolved result is not located under the
            resolved base directory
    """
    root = Path(base_path).resolve()

    # Same pattern for every component: reserved/control chars or "..".
    risky = re.compile(r'[<>:"|?*\x00-\x1f]|\.{2,}')
    cleaned = [
        sanitize_filename(piece) if risky.search(piece) else piece
        for piece in path_parts
        if piece
    ]

    # Resolve after joining so "." segments and symlinks are accounted for
    # before the containment check below.
    candidate = root.joinpath(*cleaned).resolve()

    try:
        candidate.relative_to(root)
    except ValueError:
        raise PermissionError(f"Path would escape base directory: {candidate}")
    return candidate
def secure_file_write(
    base_path: Union[str, Path],
    filename: str,
    content: str,
    mode: str = "w",
    encoding: Optional[str] = None,
    **kwargs,
) -> None:
    """
    Write content to a file that is guaranteed to sit under a base directory.

    The destination is computed with secure_path_join(), so a filename that
    would land outside base_path is rejected before anything is created.
    Missing parent directories of the destination are created.

    Args:
        base_path: The base directory under which to write the file
        filename: The target file name or relative path (untrusted)
        content: The content to write
        mode: File open mode (default: 'w')
        encoding: Text encoding; forwarded to open() only when truthy
        **kwargs: Additional arguments for open()
    """
    target = secure_path_join(base_path, filename)

    # Directories are only created once the destination has been validated.
    target.parent.mkdir(parents=True, exist_ok=True)

    open_options = dict(kwargs, mode=mode)
    if encoding:
        open_options["encoding"] = encoding

    with open(target, **open_options) as handle:
        handle.write(content)
def secure_file_read(
    base_path: Union[str, Path],
    filename: str,
    mode: str = "r",
    encoding: Optional[str] = None,
    **kwargs,
) -> str:
    """
    Read a file that is guaranteed to sit under a base directory.

    The location is computed with secure_path_join(), so a filename that
    would resolve outside base_path is rejected before any file is opened.

    Args:
        base_path: The base directory under which to read the file
        filename: The target file name or relative path (untrusted)
        mode: File open mode (default: 'r')
        encoding: Text encoding; forwarded to open() only when truthy
        **kwargs: Additional arguments for open()

    Returns:
        Whatever read() returns for the opened file

    Raises:
        FileNotFoundError: If the resolved path does not exist
        ValueError: If the resolved path exists but is not a regular file
    """
    target = secure_path_join(base_path, filename)

    # Distinguish "missing" from "present but not a file" for the caller.
    if not target.exists():
        raise FileNotFoundError(f"File not found: {target}")
    if not target.is_file():
        raise ValueError(f"Path is not a file: {target}")

    open_options = dict(kwargs, mode=mode)
    if encoding:
        open_options["encoding"] = encoding

    with open(target, **open_options) as handle:
        return handle.read()
def validate_path_safety(
    path: Union[str, Path], base_path: Optional[Union[str, Path]] = None
) -> bool:
    """
    Validate that a path is safe and doesn't contain dangerous patterns.

    The path is converted to a pathlib.Path and its string form is scanned
    for "..", "//" and, on platforms whose native separator is not a
    backslash, for backslashes. When base_path is given, the resolved path
    must additionally be located inside the resolved base path.

    Note that the scan is a plain substring test on the pathlib-rendered
    string, so a name such as "a..b" is rejected as well.

    Args:
        path: The path to validate
        base_path: Optional base path to check against

    Returns:
        True if the path is safe, False otherwise (including when the path
        cannot be processed at all)
    """
    try:
        candidate = Path(path)
        path_str = str(candidate)

        dangerous_patterns = [
            "..",  # Parent directory
            "//",  # Double slashes
        ]
        # pathlib renders separators as backslashes on Windows, so treating
        # a backslash as dangerous there would reject every path that has a
        # directory component. Only flag it where it is not the separator.
        if os.sep != "\\":
            dangerous_patterns.append("\\")

        if any(pattern in path_str for pattern in dangerous_patterns):
            return False

        # If base path is provided, ensure the path is within it
        if base_path:
            root = Path(base_path).resolve()
            try:
                candidate.resolve().relative_to(root)
            except ValueError:
                return False
        return True
    except Exception:
        # Deliberate best-effort: anything that cannot be validated is
        # reported as unsafe rather than raised to the caller.
        return False
# Backward compatibility functions that maintain the same interface as os.path
def secure_join(*paths: str) -> str:
    """
    Drop-in replacement for os.path.join that guards against path injection.

    The first argument is the base and the rest are components. If any
    component contains a reserved or control character, or two or more
    consecutive dots, the join is delegated to secure_path_join();
    otherwise the arguments are joined with pathlib unchanged.

    Args:
        *paths: Path components to join

    Returns:
        The joined path as a string, or "" when called with no arguments
    """
    if not paths:
        return ""

    root = Path(paths[0])
    components = paths[1:]

    needs_sanitizing = any(
        re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', piece) for piece in components
    )
    if not needs_sanitizing:
        # Nothing suspicious: plain join of all arguments.
        return str(Path(*paths))

    return str(secure_path_join(root, *components))
def secure_basename(path: str) -> str:
    """
    Drop-in replacement for os.path.basename with sanitization.

    The final path component is returned as-is unless it contains a
    reserved or control character, or two or more consecutive dots, in
    which case it is passed through sanitize_filename() first.

    Args:
        path: The path to get the basename from

    Returns:
        The (possibly sanitized) basename
    """
    leaf = os.path.basename(path)
    looks_risky = re.search(r'[<>:"|?*\x00-\x1f]|\.{2,}', leaf) is not None
    return sanitize_filename(leaf) if looks_risky else leaf