improve security on upload-url route
- backend/pyproject.toml +1 -0
- backend/routes/upload.py +85 -10
backend/pyproject.toml
CHANGED
@@ -27,6 +27,7 @@ dependencies = [
     "beautifulsoup4>=4.12.0",
     "evaluate>=0.4.0",
     "requests>=2.31.0",
+    "validators>=0.34.0",
 ]
 
 [build-system]
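As a quick illustration (a minimal sketch, not part of the commit): validators.url returns True for a well-formed absolute URL and a falsy failure object for anything else, which is what the new guard in upload.py relies on.

import validators

good = validators.url("https://example.com/docs/page.html")
bad = validators.url("not a url")

print(bool(good))  # True  -> the upload-url route proceeds
print(bool(bad))   # False -> the route raises HTTPException(status_code=400)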
backend/routes/upload.py
CHANGED
@@ -6,7 +6,11 @@ from bs4 import BeautifulSoup
 from PyPDF2 import PdfReader
 import requests
 from fastapi import Form
-from typing import Optional
+from typing import Optional, List
+import re
+from urllib.parse import urlparse
+import html
+import validators
 
 router = APIRouter(tags=["files"])
 
@@ -20,6 +24,14 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
 # Minimum length for any file (in characters)
 MIN_FILE_LENGTH = 500
 
+# Security limit configuration
+MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB max for downloaded content
+REQUEST_TIMEOUT = 10  # Timeout for HTTP requests
+# List of allowed domains (empty = all allowed, but should be populated in production)
+ALLOWED_DOMAINS: List[str] = []
+# List of file extensions to block in URLs
+BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
+
 def validate_pdf(file_path: str) -> bool:
     """Validate if file is a valid PDF."""
     try:
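A minimal, self-contained sketch of how these new constants are meant to be used by the checks added further down in upload_url (the is_url_allowed helper is hypothetical, introduced here only for illustration):

from typing import List
from urllib.parse import urlparse

ALLOWED_DOMAINS: List[str] = ["example.com"]  # an empty list disables the domain check
BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']

def is_url_allowed(url: str) -> bool:
    """Hypothetical helper mirroring the extension and domain checks in upload_url."""
    parsed = urlparse(url)
    if any(parsed.path.lower().endswith(ext) for ext in BLOCKED_EXTENSIONS):
        return False
    if ALLOWED_DOMAINS and parsed.netloc not in ALLOWED_DOMAINS:
        return False
    return True

print(is_url_allowed("https://example.com/report.html"))  # True
print(is_url_allowed("https://example.com/setup.exe"))    # False: blocked extension
print(is_url_allowed("https://other.org/page.html"))      # False: domain not in allow-list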
@@ -218,16 +230,76 @@ async def upload_url(url: str = Form(...)):
         Dictionary with status and session_id
     """
     try:
-        #
-
-
+        # Validate that the URL is well-formed
+        if not validators.url(url):
+            raise HTTPException(status_code=400, detail="Invalid URL format")
+
+        # Check whether the URL has a blocked extension
+        parsed_url = urlparse(url)
+        path = parsed_url.path.lower()
+        if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
+            raise HTTPException(status_code=400, detail="This file type is not allowed")
+
+        # Check whether the domain is allowed (if the list is not empty)
+        domain = parsed_url.netloc
+        if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
+            raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
+
+        # Retrieve the content from the URL with proper headers to mimic a browser
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; YourBenchBot/1.0; +https://yourbench.example.com)',
+            'Accept': 'text/html,application/xhtml+xml',
+            'Accept-Language': 'en-US,en;q=0.5',
+        }
 
-
-
+        response = requests.get(
+            url,
+            timeout=REQUEST_TIMEOUT,
+            headers=headers,
+            stream=True  # To check the size before downloading all of the content
+        )
+        response.raise_for_status()
 
-        #
-
-
+        # Check the Content-Type
+        content_type = response.headers.get('Content-Type', '')
+        if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
+            )
+
+        # Check the content size
+        content_length = int(response.headers.get('Content-Length', 0))
+        if content_length > MAX_CONTENT_SIZE:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Content too large ({content_length} bytes). Maximum size: {MAX_CONTENT_SIZE} bytes."
+            )
+
+        # Read the content with a size limit
+        content = ""
+        bytes_read = 0
+        for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
+            bytes_read += len(chunk.encode('utf-8') if isinstance(chunk, str) else chunk)
+            if bytes_read > MAX_CONTENT_SIZE:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Content exceeded maximum allowed size of {MAX_CONTENT_SIZE} bytes"
+                )
+            content += chunk if isinstance(chunk, str) else chunk.decode('utf-8', errors='replace')
+
+        # Extract text from the HTML with BeautifulSoup
+        soup = BeautifulSoup(content, 'html.parser')
+
+        # Remove potentially dangerous elements
+        for element in soup(['script', 'style', 'iframe', 'object', 'embed', 'noscript']):
+            element.extract()
+
+        # Remove on* attributes (event handlers) from all tags
+        for tag in soup.find_all(True):
+            for attr in list(tag.attrs):
+                if attr.startswith('on'):
+                    del tag[attr]
 
         # Extract the text
         text = soup.get_text()
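To make the HTML sanitization step above concrete, here is a short standalone sketch (the sample markup is mine, not from the commit) applying the same element removal and on* attribute stripping as the route:

from bs4 import BeautifulSoup

raw = """
<html><body>
  <p onclick="steal()">Hello <b>world</b></p>
  <script>alert('xss')</script>
  <style>p { color: red }</style>
</body></html>
"""

soup = BeautifulSoup(raw, 'html.parser')

# Drop potentially dangerous elements entirely, as upload_url does.
for element in soup(['script', 'style', 'iframe', 'object', 'embed', 'noscript']):
    element.extract()

# Strip inline event-handler attributes such as onclick / onload.
for tag in soup.find_all(True):
    for attr in list(tag.attrs):
        if attr.startswith('on'):
            del tag[attr]

print(soup.get_text().split())  # ['Hello', 'world']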
@@ -237,7 +309,10 @@ async def upload_url(url: str = Form(...)):
         chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
         text = '\n'.join(chunk for chunk in chunks if chunk)
 
-        #
+        # Sanitize the text to prevent any potential stored XSS
+        text = html.escape(text)
+
+        # Limit to 25000 characters if necessary
         if len(text) > 25000:
             text = text[:25000]
 
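For reference, a tiny sketch (the example string is mine) of what the added html.escape call does to the extracted text before it is persisted:

import html

text = '<img src=x onerror=alert(1)> "quoted" & more'
print(html.escape(text))
# &lt;img src=x onerror=alert(1)&gt; &quot;quoted&quot; &amp; more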
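With these changes in place, the hardened route can be exercised end to end. A rough sketch using FastAPI's TestClient; the /upload-url path and the backend.main app import are assumptions, since the route decorator and application entry point are not shown in this diff:

from fastapi.testclient import TestClient
from backend.main import app  # assumed application entry point

client = TestClient(app)

# A malformed URL should be rejected before any network request is made
# (expected 400 "Invalid URL format", provided the route lets the HTTPException propagate).
resp = client.post("/upload-url", data={"url": "not a url"})
print(resp.status_code, resp.json())

# A URL pointing at a blocked extension is refused up front as well.
resp = client.post("/upload-url", data={"url": "https://example.com/setup.exe"})
print(resp.status_code, resp.json())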