Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

document_redaction / tools /secure_regex_utils.py

seanpedrickcase

Improved paddle and hybrid OCR analysis across all options. Tried to revise requirements for spaces

2c00d05 17 days ago

raw

history blame

7.93 kB

	"""
	Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks.

	This module provides safe alternatives to common regex patterns that can cause
	catastrophic backtracking and performance issues.
	"""

	import re
	from typing import List, Optional


	def safe_extract_numbers_with_seconds(text: str) -> List[float]:
	    """
	    Collect every number that is immediately followed by the word 'seconds'.

	    The pattern is a fixed sequence (digits, optional fractional part,
	    optional whitespace, the literal word) with no nested quantifiers, so
	    it cannot trigger catastrophic backtracking.

	    Args:
	        text: The text to scan for '<number> seconds' occurrences

	    Returns:
	        The numbers found, as floats, in order of appearance. An empty list
	        is returned for empty or non-string input.
	    """
	    if not (text and isinstance(text, str)):
	        return []

	    seconds_regex = r"\b(\d+(?:\.\d+)?)\s*seconds\b"
	    captured = re.findall(seconds_regex, text)

	    values: List[float] = []
	    try:
	        for token in captured:
	            values.append(float(token))
	    except (ValueError, TypeError):
	        # Any conversion failure discards the whole result
	        return []
	    return values


	def safe_extract_numbers(text: str) -> List[float]:
	    """
	    Pull every standalone number (integer or decimal) out of a string.

	    A number only counts when it is delimited by word boundaries, so digits
	    glued to letters (e.g. 'x12') are ignored.

	    Args:
	        text: The text to scan for numbers

	    Returns:
	        The numbers found, as floats, in order of appearance. An empty list
	        is returned for empty or non-string input.
	    """
	    if not (text and isinstance(text, str)):
	        return []

	    # Digits with an optional fractional part; linear-time, no nested repeats
	    number_regex = r"\b\d+(?:\.\d+)?\b"
	    tokens = re.findall(number_regex, text)

	    try:
	        return list(map(float, tokens))
	    except (ValueError, TypeError):
	        return []


	def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
	    """
	    Read the page number that sits directly before a trailing '.png'.

	    Args:
	        filename: The filename to inspect

	    Returns:
	        The integer formed by the (at most ten) digits immediately preceding
	        the final '.png', or None when the filename is empty, not a string,
	        or does not end in digits followed by '.png'.
	    """
	    if not (filename and isinstance(filename, str)):
	        return None

	    # Bounded digit count plus an end anchor keeps matching cheap
	    found = re.search(r"(\d{1,10})\.png$", filename)
	    if not found:
	        return None

	    try:
	        page = int(found.group(1))
	    except (ValueError, TypeError):
	        return None
	    return page


	def safe_extract_page_number_from_path(path: str) -> Optional[int]:
	    """
	    Read the page number from a path that ends in '_<digits>.png'.

	    Args:
	        path: The path to inspect

	    Returns:
	        The integer formed by the (at most ten) digits between the last
	        underscore and the final '.png', or None when the path is empty,
	        not a string, or does not end in that shape.
	    """
	    if not (path and isinstance(path, str)):
	        return None

	    # Underscore, bounded digit run, then '.png' anchored at the end
	    found = re.search(r"_(\d{1,10})\.png$", path)
	    if not found:
	        return None

	    try:
	        page = int(found.group(1))
	    except (ValueError, TypeError):
	        return None
	    return page


	def safe_clean_text(text: str, remove_html: bool = True) -> str:
	    """
	    Normalise text by optionally dropping HTML tags and collapsing whitespace.

	    Args:
	        text: The text to clean
	        remove_html: When True, anything of the form '<...>' is removed first

	    Returns:
	        The text with every whitespace run reduced to a single space and
	        surrounding whitespace trimmed. An empty string is returned for
	        empty or non-string input.
	    """
	    if not (text and isinstance(text, str)):
	        return ""

	    # The negated character class consumes a tag in one pass, no backtracking
	    without_tags = re.sub(r"<[^>]*>", "", text) if remove_html else text

	    single_spaced = re.sub(r"\s+", " ", without_tags)
	    return single_spaced.strip()


	def safe_extract_rgb_values(text: str) -> Optional[tuple]:
	    """
	    Safely extract RGB values from text like "(255, 255, 255)".

	    Surrounding whitespace is ignored, and whitespace around each component
	    inside the parentheses is optional. The pattern uses bounded digit
	    counts and no nested quantifiers, so matching stays linear-time.

	    Args:
	        text: The text to extract RGB values from

	    Returns:
	        Tuple of (r, g, b) integers if the text starts with a parenthesised
	        triple whose components are all in the range 0-255, None otherwise
	    """
	    if not text or not isinstance(text, str):
	        return None

	    # Literal parentheses around three comma-separated 1-3 digit numbers.
	    # The previous pattern began with an end-of-string anchor followed by
	    # mandatory whitespace, which could never match any input.
	    pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"
	    match = re.match(pattern, text.strip())
	    if match is None:
	        return None

	    r, g, b = (int(component) for component in match.groups())

	    # Three digits allow values up to 999, so enforce the 8-bit range
	    if all(0 <= channel <= 255 for channel in (r, g, b)):
	        return (r, g, b)
	    return None


	def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]:
	    """
	    Safely split filename by delimiters without ReDoS vulnerability.

	    Each delimiter is treated as a literal string (regex metacharacters are
	    escaped) and the filename is split wherever any one of them occurs.

	    Args:
	        filename: The filename to split
	        delimiters: List of literal delimiter strings to split on; empty
	            strings in the list are ignored

	    Returns:
	        List of filename parts. An empty list for empty or non-string
	        filenames; a single-element list containing the filename when no
	        usable delimiter is supplied.
	    """
	    if not filename or not isinstance(filename, str):
	        return []

	    # An empty delimiter would match between every character, so drop it
	    usable_delimiters = [delim for delim in (delimiters or []) if delim]
	    if not usable_delimiters:
	        return [filename]

	    # Join the escaped literals with the alternation operator. Joining with
	    # an escaped pipe would instead demand all delimiters back to back with
	    # literal '|' characters between them, so nothing would ever split.
	    # Because every alternative is an escaped literal, the pattern is always
	    # valid and cannot backtrack catastrophically.
	    pattern = "|".join(re.escape(delim) for delim in usable_delimiters)

	    return re.split(pattern, filename)


	def safe_remove_leading_newlines(text: str) -> str:
	    """
	    Drop newline characters from the start of the text, then trim it.

	    Args:
	        text: The text to clean

	    Returns:
	        The text without its leading newlines and with surrounding
	        whitespace stripped. An empty string is returned for empty or
	        non-string input.
	    """
	    if not (text and isinstance(text, str)):
	        return ""

	    # Anchored single-character repeat: linear-time
	    without_leading_breaks = re.sub(r"^\n+", "", text)
	    return without_leading_breaks.strip()


	def safe_remove_non_ascii(text: str) -> str:
	    """
	    Delete every character outside the 7-bit ASCII range from the text.

	    Args:
	        text: The text to clean

	    Returns:
	        The text containing only code points 0x00-0x7F. An empty string is
	        returned for empty or non-string input.
	    """
	    if not (text and isinstance(text, str)):
	        return ""

	    # Single negated character class, matched one character at a time
	    non_ascii_regex = r"[^\x00-\x7F]"
	    ascii_only = re.sub(non_ascii_regex, "", text)
	    return ascii_only


	def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
	    """
	    Return the largest number embedded anywhere in the filename.

	    Digit runs are read in chunks of at most ten digits, so a longer run
	    contributes several separate numbers.

	    Args:
	        filename: The filename to inspect

	    Returns:
	        The maximum of all numbers found, or None when the filename is
	        empty, not a string, or contains no digits.
	    """
	    if not (filename and isinstance(filename, str)):
	        return None

	    # Bounded repetition keeps each match to a reasonable length
	    digit_runs = re.findall(r"\d{1,10}", filename)
	    if len(digit_runs) == 0:
	        return None

	    try:
	        return max(int(run) for run in digit_runs)
	    except (ValueError, TypeError):
	        return None


	def safe_sanitize_text(text: str, replacement: str = "_", max_length: int = 255) -> str:
	    """
	    Safely sanitize text by removing dangerous characters without ReDoS vulnerability.

	    The characters < > : " | ? * backslash, forward slash and the control
	    ranges 0x00-0x1f / 0x7f-0x9f are each replaced with `replacement`.
	    Consecutive replacements are then collapsed into one, replacements at
	    the start and end are removed, and the result is truncated.

	    Args:
	        text: The text to sanitize
	        replacement: String to replace dangerous characters with. It is
	            inserted literally; an empty string simply deletes the
	            dangerous characters.
	        max_length: Maximum length of the text
	    Returns:
	        Sanitized text, or an empty string for empty or non-string input
	    """
	    if not text or not isinstance(text, str):
	        return ""

	    # Use a simple pattern for dangerous characters
	    dangerous_chars = r'[<>:"\|?*\\/\x00-\x1f\x7f-\x9f]'

	    # A callable inserts the replacement verbatim; passing the string itself
	    # would have it parsed as a regex template, so backslashes would break.
	    sanitized = re.sub(dangerous_chars, lambda _match: replacement, text)

	    # With an empty replacement there is nothing to collapse or trim, and
	    # the collapse pattern would be invalid ("nothing to repeat").
	    if replacement:
	        # Group the escaped replacement so '+' repeats the whole string,
	        # not only its last character.
	        repeated = f"(?:{re.escape(replacement)})+"
	        sanitized = re.sub(repeated, lambda _match: replacement, sanitized)

	        # Trim whole occurrences of the replacement. str.strip would treat
	        # a multi-character replacement as a set of individual characters.
	        width = len(replacement)
	        while sanitized.startswith(replacement):
	            sanitized = sanitized[width:]
	        while sanitized.endswith(replacement):
	            sanitized = sanitized[:-width]

	    # Truncate to maximum length
	    return sanitized[:max_length]