File size: 7,784 Bytes
f957846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a6aac2
 
 
f957846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a6aac2
 
 
f957846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a6aac2
 
f957846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
Secure regex utilities to prevent ReDoS (Regular Expression Denial of Service) attacks.

This module provides safe alternatives to common regex patterns that can cause
catastrophic backtracking and performance issues.
"""

import re
from typing import List, Optional


def safe_extract_numbers_with_seconds(text: str) -> List[float]:
    """
    Collect every number that is directly followed by the word 'seconds'.

    A number is a run of digits with an optional fractional part. It must
    start at a word boundary, may be separated from 'seconds' by any amount
    of whitespace, and 'seconds' must end at a word boundary.

    Args:
        text: The text to scan

    Returns:
        The matched numbers as floats, in order of appearance. An empty list
        is returned for empty or non-string input, or if conversion fails.
    """
    if not (text and isinstance(text, str)):
        return []

    # Each quantifier is followed by a token it cannot itself match, so the
    # engine never has ambiguous ways to split the input.
    seconds_pattern = r"\b(\d+(?:\.\d+)?)\s*seconds\b"

    values: List[float] = []
    try:
        for found in re.finditer(seconds_pattern, text):
            values.append(float(found.group(1)))
    except (ValueError, TypeError):
        return []
    return values


def safe_extract_numbers(text: str) -> List[float]:
    """
    Pull every standalone number out of a piece of text.

    A number is a run of digits with an optional fractional part that is
    delimited by word boundaries on both sides, so digits glued to letters
    (e.g. 'c3' or '4d') are not reported. Signs are not part of the match.

    Args:
        text: The text to scan

    Returns:
        The matched numbers as floats, in order of appearance. An empty list
        is returned for empty or non-string input, or if conversion fails.
    """
    if not (text and isinstance(text, str)):
        return []

    # Digits, then optionally a dot and more digits, bounded on both ends.
    number_pattern = r"\b\d+(?:\.\d+)?\b"

    tokens = re.findall(number_pattern, text)
    try:
        return list(map(float, tokens))
    except (ValueError, TypeError):
        return []


def safe_extract_page_number_from_filename(filename: str) -> Optional[int]:
    """
    Read the page number that sits immediately before a trailing '.png'.

    Args:
        filename: The filename to inspect

    Returns:
        The digits preceding the final '.png' as an int, or None when the
        input is empty, not a string, or does not end in '<digits>.png'.
        At most ten digits are captured; for a longer digit run only the
        last ten digits are used.
    """
    if not (filename and isinstance(filename, str)):
        return None

    # Bounded digit count plus an end anchor keeps the search cheap.
    found = re.search(r"(\d{1,10})\.png$", filename)
    if not found:
        return None

    try:
        return int(found.group(1))
    except (ValueError, TypeError):
        return None


def safe_extract_page_number_from_path(path: str) -> Optional[int]:
    """
    Read the page number from a path that ends in '_<digits>.png'.

    Args:
        path: The path to inspect

    Returns:
        The 1-10 digits between the last underscore and the trailing '.png'
        as an int, or None when the input is empty, not a string, or does
        not end with that suffix.
    """
    if not (path and isinstance(path, str)):
        return None

    # The underscore must directly precede the digits; the end anchor pins
    # the match to the tail of the path.
    suffix_pattern = r"_(\d{1,10})\.png$"
    found = re.search(suffix_pattern, path)

    if found is not None:
        try:
            page = int(found.group(1))
        except (ValueError, TypeError):
            page = None
        return page

    return None


def safe_clean_text(text: str, remove_html: bool = True) -> str:
    """
    Safely clean text without ReDoS vulnerability.

    Optionally removes every '<...>' span (a '<' up to the next '>',
    newlines included), then collapses each run of whitespace into a single
    space and trims the ends.

    Args:
        text: The text to clean
        remove_html: Whether to remove HTML tags

    Returns:
        Cleaned text, or an empty string for empty or non-string input
    """
    if not text or not isinstance(text, str):
        return ""

    cleaned = text

    if remove_html:
        # Equivalent to re.sub(r"<[^>]*>", "", cleaned), but done with a
        # single forward scan. The regex form rescans to the end of the
        # input for every '<' that has no closing '>', which is quadratic
        # on inputs such as "<" * n.
        kept = []
        pos = 0
        while True:
            start = cleaned.find("<", pos)
            if start == -1:
                break
            end = cleaned.find(">", start + 1)
            if end == -1:
                # No '>' remains, so no later '<' can open a tag either.
                break
            kept.append(cleaned[pos:start])
            pos = end + 1
        kept.append(cleaned[pos:])
        cleaned = "".join(kept)

    # Clean up whitespace
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    return cleaned


def safe_extract_rgb_values(text: str) -> Optional[tuple]:
    """
    Parse a leading "(r, g, b)" triple such as "(255, 255, 255)".

    Surrounding whitespace is stripped first and the triple must start at
    the beginning of the remaining text; anything after the closing
    parenthesis is ignored.

    Args:
        text: The text to parse

    Returns:
        Tuple of (r, g, b) ints when all three components are within
        0-255, None otherwise
    """
    if not (text and isinstance(text, str)):
        return None

    # Three 1-3 digit groups separated by commas, whitespace allowed
    # around each component.
    rgb_pattern = r"\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)"
    found = re.match(rgb_pattern, text.strip())
    if not found:
        return None

    try:
        channels = tuple(int(found.group(index)) for index in (1, 2, 3))
    except (ValueError, TypeError):
        return None

    # Three digits can encode up to 999, so the range still needs checking.
    if all(0 <= channel <= 255 for channel in channels):
        return channels
    return None


def safe_split_filename(filename: str, delimiters: List[str]) -> List[str]:
    """
    Safely split filename by delimiters without ReDoS vulnerability.

    Every delimiter is treated as a literal string (regex metacharacters are
    escaped). Empty delimiters are ignored. When delimiters overlap, the one
    listed first wins at a given position.

    Args:
        filename: The filename to split
        delimiters: List of literal delimiter strings to split on

    Returns:
        List of filename parts. An empty list for empty or non-string
        filename; a single-element list holding the filename when there is
        no usable delimiter.
    """
    if not filename or not isinstance(filename, str):
        return []

    if not delimiters:
        return [filename]

    # An empty delimiter would become an empty regex alternative, which
    # matches at every position and splits between all characters.
    usable = [delim for delim in delimiters if delim]
    if not usable:
        return [filename]

    # Escaped literals joined by '|' always form a valid pattern, so no
    # re.error fallback is needed.
    pattern = "|".join(re.escape(delim) for delim in usable)

    return re.split(pattern, filename)


def safe_remove_leading_newlines(text: str) -> str:
    """
    Drop leading newlines and trim surrounding whitespace.

    Args:
        text: The text to clean

    Returns:
        The text with leading newlines removed and with all leading and
        trailing whitespace stripped; an empty string for empty or
        non-string input
    """
    if not (text and isinstance(text, str)):
        return ""

    # Newlines are whitespace, so trimming both ends already removes any
    # leading run of them; no separate pattern pass is required.
    return text.strip()


def safe_remove_non_ascii(text: str) -> str:
    """
    Drop every character whose code point lies outside the ASCII range.

    Args:
        text: The text to clean

    Returns:
        The text restricted to characters U+0000 through U+007F; an empty
        string for empty or non-string input
    """
    if not (text and isinstance(text, str)):
        return ""

    # Keep only code points up to 0x7F, the last ASCII character.
    return "".join(char for char in text if char <= "\x7f")


def safe_extract_latest_number_from_filename(filename: str) -> Optional[int]:
    """
    Find the largest number embedded anywhere in a filename.

    Digit runs are read in chunks of at most ten digits, so a longer run is
    split into several numbers before the maximum is taken.

    Args:
        filename: The filename to inspect

    Returns:
        The largest number found, or None when the input is empty, not a
        string, or contains no digits
    """
    if not (filename and isinstance(filename, str)):
        return None

    # Bounded repetition keeps each candidate short.
    digit_runs = re.findall(r"\d{1,10}", filename)
    if len(digit_runs) == 0:
        return None

    try:
        return max(map(int, digit_runs))
    except (ValueError, TypeError):
        return None


def safe_sanitize_text(text: str, replacement: str = "_") -> str:
    """
    Safely sanitize text by removing dangerous characters without ReDoS vulnerability.

    The characters < > : " | ? * \\ / and the control ranges 0x00-0x1F and
    0x7F-0x9F are each replaced with `replacement`. Consecutive copies of
    the replacement are then collapsed into one, and copies at the very
    start or end of the result are removed.

    Args:
        text: The text to sanitize
        replacement: String to replace dangerous characters with. It is
            always inserted literally; an empty string simply deletes the
            dangerous characters.

    Returns:
        Sanitized text, or an empty string for empty or non-string input
    """
    if not text or not isinstance(text, str):
        return ""

    # Use a simple pattern for dangerous characters
    dangerous_chars = r'[<>:"|?*\\/\x00-\x1f\x7f-\x9f]'

    # A callable replacement is inserted verbatim. Passing the string
    # directly would make re.sub interpret backslashes and group references
    # in it (a lone backslash raises re.error).
    sanitized = re.sub(dangerous_chars, lambda _match: replacement, text)

    if not replacement:
        # Nothing was inserted, so there is nothing to collapse or trim;
        # building a pattern from an empty string would be invalid.
        return sanitized

    # Group the escaped replacement so '+' repeats the whole string rather
    # than only its last character.
    unit = f"(?:{re.escape(replacement)})"

    # Remove multiple consecutive replacements
    sanitized = re.sub(f"{unit}+", lambda _match: replacement, sanitized)

    # Remove leading/trailing replacements as whole units; str.strip would
    # treat a multi-character replacement as a set of characters.
    sanitized = re.sub(f"\\A{unit}+|{unit}+\\Z", "", sanitized)

    return sanitized