tfrere committed
Commit: b1e6db8 · 1 Parent(s): ab84d6a

improve security on upload-url route

Files changed (2):
  1. backend/pyproject.toml  +1 -0
  2. backend/routes/upload.py  +85 -10
backend/pyproject.toml
@@ -27,6 +27,7 @@ dependencies = [
     "beautifulsoup4>=4.12.0",
     "evaluate>=0.4.0",
     "requests>=2.31.0",
+    "validators>=0.34.0",
 ]

 [build-system]
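For context, the new validators dependency backs the URL check added to upload.py below: validators.url() returns True for a well-formed URL and a falsy ValidationError object otherwise, so the result can be used directly in a condition. A minimal sketch, not part of the commit:

import validators

# Truthy on a well-formed URL, falsy (a ValidationError instance) otherwise,
# so it drops straight into an `if`, as the route does below.
print(bool(validators.url("https://example.com/page")))  # True
print(bool(validators.url("not a url")))                 # False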
backend/routes/upload.py
@@ -6,7 +6,11 @@ from bs4 import BeautifulSoup
 from PyPDF2 import PdfReader
 import requests
 from fastapi import Form
-from typing import Optional
+from typing import Optional, List
+import re
+from urllib.parse import urlparse
+import html
+import validators

 router = APIRouter(tags=["files"])

@@ -20,6 +24,14 @@ os.makedirs(UPLOAD_ROOT, exist_ok=True)
 # Minimum length for any file (in characters)
 MIN_FILE_LENGTH = 500

+# Security limits configuration
+MAX_CONTENT_SIZE = 5 * 1024 * 1024  # 5 MB max for downloaded content
+REQUEST_TIMEOUT = 10  # Timeout for HTTP requests
+# List of allowed domains (empty = all allowed, but should be populated in production)
+ALLOWED_DOMAINS: List[str] = []
+# List of file extensions to block in URLs
+BLOCKED_EXTENSIONS = ['.exe', '.sh', '.bat', '.dll', '.jar', '.msi']
+
 def validate_pdf(file_path: str) -> bool:
     """Validate if file is a valid PDF."""
     try:
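ALLOWED_DOMAINS ships empty, so every domain passes until it is populated. A minimal sketch of how the check added further down behaves once the list is filled in; the domain entries here are hypothetical, not part of the commit:

from urllib.parse import urlparse

ALLOWED_DOMAINS = ["en.wikipedia.org", "arxiv.org"]  # hypothetical entries

def domain_allowed(url: str) -> bool:
    # urlparse().netloc keeps an explicit port ("host:8080"), so entries
    # must match exactly what urlparse reports for the incoming URL.
    domain = urlparse(url).netloc
    return not ALLOWED_DOMAINS or domain in ALLOWED_DOMAINS

print(domain_allowed("https://arxiv.org/abs/1706.03762"))  # True
print(domain_allowed("https://untrusted.example/page"))    # False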
@@ -218,16 +230,76 @@ async def upload_url(url: str = Form(...)):
         Dictionary with status and session_id
     """
     try:
-        # Retrieve the content from the URL
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()  # Raise an exception if the HTTP status is not 200
+        # Validate that the URL is well-formed
+        if not validators.url(url):
+            raise HTTPException(status_code=400, detail="Invalid URL format")
+
+        # Check whether the URL has a blocked extension
+        parsed_url = urlparse(url)
+        path = parsed_url.path.lower()
+        if any(path.endswith(ext) for ext in BLOCKED_EXTENSIONS):
+            raise HTTPException(status_code=400, detail="This file type is not allowed")
+
+        # Check whether the domain is allowed (if the list is not empty)
+        domain = parsed_url.netloc
+        if ALLOWED_DOMAINS and domain not in ALLOWED_DOMAINS:
+            raise HTTPException(status_code=403, detail="This domain is not in the allowed list")
+
+        # Retrieve the content from the URL with proper headers to mimic a browser
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; YourBenchBot/1.0; +https://yourbench.example.com)',
+            'Accept': 'text/html,application/xhtml+xml',
+            'Accept-Language': 'en-US,en;q=0.5',
+        }

-        # Extract text from HTML with BeautifulSoup
-        soup = BeautifulSoup(response.text, 'html.parser')
+        response = requests.get(
+            url,
+            timeout=REQUEST_TIMEOUT,
+            headers=headers,
+            stream=True  # To check the size before downloading the whole body
+        )
+        response.raise_for_status()

-        # Remove script and style tags
-        for script in soup(["script", "style"]):
-            script.extract()
+        # Check the Content-Type
+        content_type = response.headers.get('Content-Type', '')
+        if not content_type.startswith(('text/html', 'text/plain', 'application/xhtml+xml')):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported content type: {content_type}. Only HTML and text formats are supported."
+            )
+
+        # Check the content size
+        content_length = int(response.headers.get('Content-Length', 0))
+        if content_length > MAX_CONTENT_SIZE:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Content too large ({content_length} bytes). Maximum size: {MAX_CONTENT_SIZE} bytes."
+            )
+
+        # Read the content with a size limit
+        content = ""
+        bytes_read = 0
+        for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
+            bytes_read += len(chunk.encode('utf-8') if isinstance(chunk, str) else chunk)
+            if bytes_read > MAX_CONTENT_SIZE:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Content exceeded maximum allowed size of {MAX_CONTENT_SIZE} bytes"
+                )
+            content += chunk if isinstance(chunk, str) else chunk.decode('utf-8', errors='replace')
+
+        # Extract text from HTML with BeautifulSoup
+        soup = BeautifulSoup(content, 'html.parser')
+
+        # Remove potentially dangerous elements
+        for element in soup(['script', 'style', 'iframe', 'object', 'embed', 'noscript']):
+            element.extract()
+
+        # Remove on* attributes (event handlers) from all tags
+        for tag in soup.find_all(True):
+            for attr in list(tag.attrs):
+                if attr.startswith('on'):
+                    del tag[attr]

         # Extract the text
         text = soup.get_text()
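Note the double size check above: the Content-Length header is advisory and can be absent or spoofed, so the streamed read re-enforces the cap on the bytes actually received. The same pattern as a standalone sketch, counting raw bytes rather than the decoded-chunk accounting the route uses:

import requests

MAX_CONTENT_SIZE = 5 * 1024 * 1024  # same 5 MB cap as the route

def fetch_capped(url: str, cap: int = MAX_CONTENT_SIZE) -> str:
    # stream=True defers the body download; iter_content() then lets the
    # read stop as soon as the running total crosses the cap, so a huge
    # (or endless) response cannot exhaust memory.
    response = requests.get(url, timeout=10, stream=True)
    response.raise_for_status()
    parts, total = [], 0
    for chunk in response.iter_content(chunk_size=8192):
        total += len(chunk)
        if total > cap:
            raise ValueError(f"response exceeded {cap} bytes")
        parts.append(chunk)
    return b"".join(parts).decode(response.encoding or "utf-8", errors="replace")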
@@ -237,7 +309,10 @@ async def upload_url(url: str = Form(...)):
         chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
         text = '\n'.join(chunk for chunk in chunks if chunk)

-        # Limit to 1000 characters if necessary
+        # Sanitize the text to prevent any potential stored XSS
+        text = html.escape(text)
+
+        # Limit to 25000 characters if necessary
         if len(text) > 25000:
             text = text[:25000]

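html.escape() converts the HTML-significant characters (&, <, >, and quotes) into entities, which neutralizes any markup that survives the tag stripping above. For illustration:

import html

# Any residual markup is rendered inert once escaped.
print(html.escape('<img src=x onerror="alert(1)">'))
# &lt;img src=x onerror=&quot;alert(1)&quot;&gt;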
 
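End to end, a client call against the hardened route would look roughly like the sketch below; the path ("/upload-url") and port are assumptions based on the commit message, since the route decorator is not shown in this diff:

import requests

# Hypothetical local call; route path and port are assumed.
response = requests.post(
    "http://localhost:8000/upload-url",
    data={"url": "https://en.wikipedia.org/wiki/Main_Page"},
)
print(response.status_code, response.json())
# 200 with a session_id on success; 400/403 for rejected URLs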