Spaces:

yourbench
/

demo

Running on CPU Upgrade

File size: 7,691 Bytes

from fastapi import APIRouter, UploadFile, File, HTTPException
import os
import shutil
import uuid
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

# Router for the endpoints defined in this module, tagged "files"
router = APIRouter(tags=["files"])

# Per-session file storage: maps a session ID (or a pre-calculated document
# ID) to the path of its stored file (imported in main.py)
session_files = {}

# Root directory for uploads; it is a relative path and is created at import time
UPLOAD_ROOT = "uploaded_files"
os.makedirs(UPLOAD_ROOT, exist_ok=True)

# Minimum length for any file (in characters)
MIN_FILE_LENGTH = 500

def validate_pdf(file_path: str) -> bool:
    """Check that a PDF is readable and contains enough extractable text.

    Args:
        file_path: Path of the PDF file to check.

    Returns:
        True if the file parses as a PDF, has at least one page, and the text
        extracted from all of its pages is at least MIN_FILE_LENGTH characters
        long. False otherwise, including when the file cannot be opened or
        parsed.
    """
    try:
        reader = PdfReader(file_path)
        # A PDF without pages has nothing to extract
        if len(reader.pages) == 0:
            return False

        # Join the text of every page in one pass; `or ""` keeps a page that
        # yields no text from aborting the whole validation
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # Any parser or I/O failure means the file is not a usable PDF.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return False

    return len(text) >= MIN_FILE_LENGTH

def validate_markdown(file_path: str) -> bool:
    """Check that a file is UTF-8 text that looks like Markdown.

    Args:
        file_path: Path of the Markdown file to check.

    Returns:
        True if the file decodes as UTF-8, is at least MIN_FILE_LENGTH
        characters long and contains at least one of the characters
        ``# - * ` [ >``. False otherwise, including when the file cannot be
        read or decoded.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError covers UnicodeDecodeError
        return False

    # Check the minimum length first, then the presence of a Markdown marker
    if len(content) < MIN_FILE_LENGTH:
        return False
    return any(marker in content for marker in ('#', '-', '*', '`', '[', '>'))

def validate_html(file_path: str) -> bool:
    """Check that a file is UTF-8 text of sufficient length that parses as HTML.

    Args:
        file_path: Path of the HTML file to check.

    Returns:
        True if the file decodes as UTF-8, is at least MIN_FILE_LENGTH
        characters long and BeautifulSoup's ``html.parser`` accepts it.
        False otherwise, including when the file cannot be read or decoded.

    Note:
        ``html.parser`` is lenient, so the parse step is not a strict
        well-formedness check.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError covers UnicodeDecodeError
        return False

    # Check the minimum length before paying for a parse
    if len(content) < MIN_FILE_LENGTH:
        return False

    try:
        BeautifulSoup(content, 'html.parser')
    except Exception:
        # The parser rejected the markup outright
        return False
    return True

def validate_txt(file_path: str) -> bool:
    """Check that a file is UTF-8 text with enough non-blank content.

    Args:
        file_path: Path of the text file to check.

    Returns:
        True if the file decodes as UTF-8 and, once leading and trailing
        whitespace is stripped, is at least MIN_FILE_LENGTH characters long.
        False otherwise, including when the file cannot be read or decoded.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError covers UnicodeDecodeError
        return False

    # Surrounding whitespace does not count towards the minimum length
    return len(content.strip()) >= MIN_FILE_LENGTH

# Register the pre-calculated documents in session_files at import time.
# For each document ID, the first matching file found on disk is recorded.
precalculated_docs = ["the-bitter-lesson", "hurricane-faq", "pokemon-guide"]

# File extensions recognised as a document when scanning a directory
_DOCUMENT_EXTENSIONS = (".pdf", ".txt", ".html", ".md")

for doc_id in precalculated_docs:
    doc_dir = os.path.join(UPLOAD_ROOT, doc_id)
    # isdir (rather than exists) so a stray regular file cannot crash listdir
    if not os.path.isdir(doc_dir):
        continue

    # Search the "uploaded_files" sub-directory when it exists; only when it
    # is absent is the document directory itself searched
    doc_files_dir = os.path.join(doc_dir, "uploaded_files")
    search_dir = doc_files_dir if os.path.isdir(doc_files_dir) else doc_dir

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # deterministic when several candidate files are present
    for filename in sorted(os.listdir(search_dir)):
        if filename.endswith(_DOCUMENT_EXTENSIONS):
            file_path = os.path.join(search_dir, filename)
            session_files[doc_id] = file_path
            print(f"Added pre-calculated document to session_files: {doc_id} -> {file_path}")
            break

# Extensions accepted by the /upload endpoint (compared in lower case)
_ALLOWED_EXTENSIONS = ('.pdf', '.txt', '.html', '.md')

# Characters whose presence is taken as a sign of Markdown formatting
_MARKDOWN_MARKERS = ('#', '-', '*', '`', '[', '>')


def _check_pdf_upload(file_path: str) -> str:
    """Return an error message if the saved PDF is unacceptable, else ""."""
    try:
        reader = PdfReader(file_path)
        if len(reader.pages) == 0:
            return "PDF must contain at least one page"
        # `or ""` keeps a page that yields no text from failing the whole file
        text = "".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        # Any parser failure means the file is not a usable PDF
        return "Invalid PDF format"

    if len(text) < MIN_FILE_LENGTH:
        return f"PDF contains {len(text)} characters but must contain at least {MIN_FILE_LENGTH}"
    return ""


def _check_text_upload(file_path: str, file_extension: str) -> str:
    """Return an error message if the saved .md/.html/.txt file is unacceptable, else ""."""
    # (label used in the length message, label used in the format message)
    labels = {
        '.md': ("Markdown", "Markdown"),
        '.html': ("HTML", "HTML"),
        '.txt': ("Text", "text"),
    }
    if file_extension not in labels:
        return f"Invalid {file_extension[1:].upper()} file"
    kind, format_kind = labels[file_extension]

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, ValueError):
        # OSError: unreadable file; ValueError covers UnicodeDecodeError
        return f"Invalid {format_kind} format"

    # Plain-text files are measured without surrounding whitespace
    length = len(content.strip()) if file_extension == '.txt' else len(content)
    if length < MIN_FILE_LENGTH:
        return f"{kind} file contains {length} characters but must contain at least {MIN_FILE_LENGTH}"

    if file_extension == '.md' and not any(marker in content for marker in _MARKDOWN_MARKERS):
        return "Markdown file does not contain any valid Markdown elements"

    if file_extension == '.html':
        try:
            BeautifulSoup(content, 'html.parser')
        except Exception:
            return "Invalid HTML format"

    return ""


@router.post("/upload")
async def upload_file(file: UploadFile = File(...)):
    """
    Upload a file to the server and generate a session ID

    The file is saved as ``document<ext>`` under
    ``UPLOAD_ROOT/<session_id>/uploaded_files/``, validated according to its
    type, and registered in ``session_files`` under the new session ID.

    Args:
        file: The file to upload

    Returns:
        Dictionary with filename, status and session_id

    Raises:
        HTTPException: 400 if the extension is not PDF, TXT, HTML or MD, or if
            the content fails validation (the session directory is removed).
    """
    # Normalise the extension once so "REPORT.PDF" is handled like "report.pdf";
    # `or ""` covers an upload sent without a filename
    file_extension = os.path.splitext(file.filename or "")[1].lower()
    if file_extension not in _ALLOWED_EXTENSIONS:
        raise HTTPException(status_code=400, detail="Only PDF, TXT, HTML and MD files are accepted")

    # Generate a session ID for this file
    session_id = str(uuid.uuid4())

    # Create the session directory structure
    session_dir = os.path.join(UPLOAD_ROOT, session_id)
    uploaded_files_dir = os.path.join(session_dir, "uploaded_files")
    os.makedirs(uploaded_files_dir, exist_ok=True)

    # Store the upload under a standardized filename
    standardized_filename = f"document{file_extension}"
    file_path = os.path.join(uploaded_files_dir, standardized_filename)

    # NOTE(review): this is blocking file I/O inside an async handler; consider
    # moving it to a threadpool if uploads become large or frequent
    try:
        # Save the file
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Validate the file according to its type
        if file_extension == '.pdf':
            error_detail = _check_pdf_upload(file_path)
        else:
            error_detail = _check_text_upload(file_path, file_extension)
    except Exception:
        # Do not leave a half-written session directory behind
        shutil.rmtree(session_dir, ignore_errors=True)
        raise

    if error_detail:
        # Remove the invalid file together with its freshly created session directory
        shutil.rmtree(session_dir, ignore_errors=True)
        raise HTTPException(status_code=400, detail=error_detail)

    # Store file path for later use
    session_files[session_id] = file_path

    return {"filename": standardized_filename, "status": "uploaded", "session_id": session_id}