Improved paddle and hybrid OCR analysis across all options. Tried to revise requirements for spaces
2c00d05
| """ | |
| Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks. | |
| This module provides safe alternatives to common regex patterns that can cause | |
| catastrophic backtracking and performance issues. | |
| """ | |
| import re | |
| from typing import List, Optional | |
| def safe_extract_numbers_with_seconds(text: str) -> List[float]: | |
| """ | |
| Safely extract numbers before 'seconds' from text without ReDoS vulnerability. | |
| Args: | |
| text: The text to search for numbers followed by 'seconds' | |
| Returns: | |
| List of float numbers found before 'seconds' | |
| """ | |
| if not text or not isinstance(text, str): | |
| return [] | |
| # Use a more specific pattern that avoids catastrophic backtracking | |
| # Look for digits, optional decimal part, optional whitespace, then 'seconds' | |
| pattern = r"\b(\d+(?:\.\d+)?)\s*seconds\b" | |
| matches = re.findall(pattern, text) | |
| try: | |
| return [float(match) for match in matches] | |
| except (ValueError, TypeError): | |
| return [] | |
| def safe_extract_numbers(text: str) -> List[float]: | |
| """ | |
| Safely extract all numbers from text without ReDoS vulnerability. | |
| Args: | |
| text: The text to extract numbers from | |
| Returns: | |
| List of float numbers found in the text | |
| """ | |
| if not text or not isinstance(text, str): | |
| return [] | |
| # Use a simple, safe pattern that doesn't cause backtracking | |
| # Match digits, optional decimal point and more digits | |
| pattern = r"\b\d+(?:\.\d+)?\b" | |
| matches = re.findall(pattern, text) | |
| try: | |
| return [float(match) for match in matches] | |
| except (ValueError, TypeError): | |
| return [] | |
| def safe_extract_page_number_from_filename(filename: str) -> Optional[int]: | |
| """ | |
| Safely extract page number from filename ending with .png. | |
| Args: | |
| filename: The filename to extract page number from | |
| Returns: | |
| Page number if found, None otherwise | |
| """ | |
| if not filename or not isinstance(filename, str): | |
| return None | |
| # Use a more specific, secure pattern that avoids potential ReDoS | |
| # Match 1-10 digits followed by .png at the end of string | |
| pattern = r"(\d{1,10})\.png$" | |
| match = re.search(pattern, filename) | |
| if match: | |
| try: | |
| return int(match.group(1)) | |
| except (ValueError, TypeError): | |
| return None | |
| return None | |
| def safe_extract_page_number_from_path(path: str) -> Optional[int]: | |
| """ | |
| Safely extract page number from path containing _(\\d+).png pattern. | |
| Args: | |
| path: The path to extract page number from | |
| Returns: | |
| Page number if found, None otherwise | |
| """ | |
| if not path or not isinstance(path, str): | |
| return None | |
| # Use a more specific, secure pattern that avoids potential ReDoS | |
| # Match underscore followed by 1-10 digits and .png at the end | |
| pattern = r"_(\d{1,10})\.png$" | |
| match = re.search(pattern, path) | |
| if match: | |
| try: | |
| return int(match.group(1)) | |
| except (ValueError, TypeError): | |
| return None | |
| return None | |
| def safe_clean_text(text: str, remove_html: bool = True) -> str: | |
| """ | |
| Safely clean text without ReDoS vulnerability. | |
| Args: | |
| text: The text to clean | |
| remove_html: Whether to remove HTML tags | |
| Returns: | |
| Cleaned text | |
| """ | |
| if not text or not isinstance(text, str): | |
| return "" | |
| cleaned = text | |
| if remove_html: | |
| # Use a simple pattern that doesn't cause backtracking | |
| cleaned = re.sub(r"<[^>]*>", "", cleaned) | |
| # Clean up whitespace | |
| cleaned = re.sub(r"\s+", " ", cleaned).strip() | |
| return cleaned | |
| def safe_extract_rgb_values(text: str) -> Optional[tuple]: | |
| """ | |
| Safely extract RGB values from text like "(255, 255, 255)". | |
| Args: | |
| text: The text to extract RGB values from | |
| Returns: | |
| Tuple of (r, g, b) values if found, None otherwise | |
| """ | |
| if not text or not isinstance(text, str): | |
| return None | |
| # Use a simple, safe pattern | |
| pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)" | |
| match = re.match(pattern, text.strip()) | |
| if match: | |
| try: | |
| r = int(match.group(1)) | |
| g = int(match.group(2)) | |
| b = int(match.group(3)) | |
| # Validate RGB values | |
| if 0 <= r <= 255 and 0 <= g <= 255 and 0 <= b <= 255: | |
| return (r, g, b) | |
| except (ValueError, TypeError): | |
| pass | |
| return None | |
| def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]: | |
| """ | |
| Safely split filename by delimiters without ReDoS vulnerability. | |
| Args: | |
| filename: The filename to split | |
| delimiters: List of delimiter patterns to split on | |
| Returns: | |
| List of filename parts | |
| """ | |
| if not filename or not isinstance(filename, str): | |
| return [] | |
| if not delimiters: | |
| return [filename] | |
| # Escape special regex characters in delimiters | |
| escaped_delimiters = [re.escape(delim) for delim in delimiters] | |
| # Create a safe pattern | |
| pattern = "|".join(escaped_delimiters) | |
| try: | |
| return re.split(pattern, filename) | |
| except re.error: | |
| # Fallback to simple string operations if regex fails | |
| result = [filename] | |
| for delim in delimiters: | |
| new_result = [] | |
| for part in result: | |
| new_result.extend(part.split(delim)) | |
| result = new_result | |
| return result | |
| def safe_remove_leading_newlines(text: str) -> str: | |
| """ | |
| Safely remove leading newlines without ReDoS vulnerability. | |
| Args: | |
| text: The text to clean | |
| Returns: | |
| Text with leading newlines removed | |
| """ | |
| if not text or not isinstance(text, str): | |
| return "" | |
| # Use a simple pattern | |
| return re.sub(r"^\n+", "", text).strip() | |
| def safe_remove_non_ascii(text: str) -> str: | |
| """ | |
| Safely remove non-ASCII characters without ReDoS vulnerability. | |
| Args: | |
| text: The text to clean | |
| Returns: | |
| Text with non-ASCII characters removed | |
| """ | |
| if not text or not isinstance(text, str): | |
| return "" | |
| # Use a simple pattern | |
| return re.sub(r"[^\x00-\x7F]", "", text) | |
| def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]: | |
| """ | |
| Safely extract the latest/largest number from filename without ReDoS vulnerability. | |
| Args: | |
| filename: The filename to extract number from | |
| Returns: | |
| The largest number found, or None if no numbers found | |
| """ | |
| if not filename or not isinstance(filename, str): | |
| return None | |
| # Use a safe pattern to find all numbers (limit to reasonable length) | |
| pattern = r"\d{1,10}" | |
| matches = re.findall(pattern, filename) | |
| if not matches: | |
| return None | |
| try: | |
| # Convert to integers and return the maximum | |
| numbers = [int(match) for match in matches] | |
| return max(numbers) | |
| except (ValueError, TypeError): | |
| return None | |
| def safe_sanitize_text(text: str, replacement: str = "_", max_length: int = 255) -> str: | |
| """ | |
| Safely sanitize text by removing dangerous characters without ReDoS vulnerability. | |
| Args: | |
| text: The text to sanitize | |
| replacement: Character to replace dangerous characters with | |
| max_length: Maximum length of the text | |
| Returns: | |
| Sanitized text | |
| """ | |
| if not text or not isinstance(text, str): | |
| return "" | |
| # Use a simple pattern for dangerous characters | |
| dangerous_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]' | |
| sanitized = re.sub(dangerous_chars, replacement, text) | |
| # Remove multiple consecutive replacements | |
| sanitized = re.sub(f"{re.escape(replacement)}+", replacement, sanitized) | |
| # Remove leading/trailing replacements | |
| sanitized = sanitized.strip(replacement) | |
| # Truncate to maximum length | |
| sanitized = sanitized[:max_length] | |
| return sanitized | |