File size: 6,290 Bytes
970eef1
c750639
970eef1
 
 
 
 
 
 
c750639
 
970eef1
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
e64aebd
970eef1
 
 
e64aebd
 
970eef1
 
 
 
 
 
 
e64aebd
970eef1
e64aebd
970eef1
 
e64aebd
970eef1
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
 
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
c750639
 
 
 
 
e64aebd
c750639
 
e64aebd
c750639
 
e64aebd
c750639
 
e64aebd
c750639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e64aebd
c750639
e64aebd
 
c750639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e64aebd
c750639
 
e64aebd
970eef1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# FastAPI router exposing download endpoints for session artifacts:
# a full dataset snapshot as a ZIP, and the generated questions as JSON,
# both pulled from the HuggingFace Hub.
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from huggingface_hub import hf_hub_download, snapshot_download
import os
import tempfile
import shutil
import zipfile
import io
import logging
import json
from datasets import load_dataset

# Shared router; both endpoints below are grouped under the "download" tag.
router = APIRouter(tags=["download"])

@router.get("/download-dataset/{session_id}")
async def download_dataset(session_id: str):
    """
    Downloads the HuggingFace dataset associated with a session and returns it to the client.

    Args:
        session_id: Session identifier

    Returns:
        StreamingResponse: ZIP file containing every file of the dataset snapshot

    Raises:
        HTTPException: 500 if the snapshot cannot be downloaded or zipped
    """
    try:
        # Temporary directory holds the snapshot; it is removed automatically on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            # HuggingFace repo identifier
            repo_id = f"yourbench/yourbench_{session_id}"

            try:
                # Download the dataset snapshot from HuggingFace
                logging.info(f"Downloading dataset {repo_id}")
                snapshot_path = snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    local_dir=temp_dir,
                    token=os.environ.get("HF_TOKEN")
                )

                logging.info(f"Dataset downloaded to {snapshot_path}")

                # Build the ZIP entirely in memory so nothing extra touches disk
                zip_io = io.BytesIO()
                with zipfile.ZipFile(zip_io, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                    # Walk the snapshot and add every file to the archive
                    for root, _, files in os.walk(snapshot_path):
                        for file in files:
                            file_path = os.path.join(root, file)
                            # Store paths relative to the snapshot root inside the archive
                            arc_name = os.path.relpath(file_path, snapshot_path)
                            zip_file.write(file_path, arcname=arc_name)

                # Reset the cursor to the beginning of the stream before streaming it out
                zip_io.seek(0)

                # Return the ZIP to the client.
                # FIX: the header previously contained a literal placeholder instead of
                # interpolating the computed filename, leaving `filename` unused.
                filename = f"yourbench_{session_id}_dataset.zip"
                return StreamingResponse(
                    zip_io,
                    media_type="application/zip",
                    headers={"Content-Disposition": f"attachment; filename={filename}"}
                )

            except Exception as e:
                logging.error(f"Error while downloading the dataset: {str(e)}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Error while downloading the dataset: {str(e)}"
                )
    except HTTPException:
        # FIX: re-raise HTTP exceptions unchanged instead of letting the generic
        # handler below re-wrap them (same pattern as download_questions).
        raise
    except Exception as e:
        logging.error(f"General error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error during download: {str(e)}"
        )

@router.get("/download-questions/{session_id}")
async def download_questions(session_id: str):
    """
    Downloads the questions generated for a session in JSON format.

    Args:
        session_id: Session identifier

    Returns:
        StreamingResponse: JSON file containing only the list of generated questions

    Raises:
        HTTPException: 404 if no questions exist for the session, 500 on any other error
    """
    try:
        # HuggingFace repo identifier
        dataset_repo_id = f"yourbench/yourbench_{session_id}"

        # Collected questions from all available configurations
        all_questions = []

        # Try to load single-shot questions; failure here is logged but not fatal,
        # since the multi-hop configuration may still exist.
        try:
            single_dataset = load_dataset(dataset_repo_id, 'single_shot_questions')
            if single_dataset and len(single_dataset['train']) > 0:
                for idx in range(len(single_dataset['train'])):
                    all_questions.append({
                        "id": str(idx),
                        "question": single_dataset['train'][idx].get("question", ""),
                        "answer": single_dataset['train'][idx].get("self_answer", "No answer available"),
                        "type": "single_shot"
                    })
                logging.info(f"Loaded {len(all_questions)} single-shot questions")
        except Exception as e:
            logging.error(f"Error loading single-shot questions: {str(e)}")

        # Try to load multi-hop questions; ids continue after the single-shot ones
        try:
            multi_dataset = load_dataset(dataset_repo_id, 'multi_hop_questions')
            if multi_dataset and len(multi_dataset['train']) > 0:
                start_idx = len(all_questions)
                for idx in range(len(multi_dataset['train'])):
                    all_questions.append({
                        "id": str(start_idx + idx),
                        "question": multi_dataset['train'][idx].get("question", ""),
                        "answer": multi_dataset['train'][idx].get("self_answer", "No answer available"),
                        "type": "multi_hop"
                    })
                logging.info(f"Loaded {len(multi_dataset['train'])} multi-hop questions")
        except Exception as e:
            logging.error(f"Error loading multi-hop questions: {str(e)}")

        # If we couldn't load any questions, the dataset might not exist
        if len(all_questions) == 0:
            raise HTTPException(status_code=404, detail="No questions found for this session")

        # Serialize only the list of questions (no session_id, no wrapping object)
        questions_json = json.dumps(all_questions, ensure_ascii=False, indent=2)

        # Wrap the JSON payload in an in-memory stream for the response
        json_bytes = io.BytesIO(questions_json.encode('utf-8'))
        json_bytes.seek(0)

        # Return the JSON file for download.
        # FIX: the header previously contained a literal placeholder instead of
        # interpolating the computed filename, leaving `filename` unused.
        filename = f"yourbench_{session_id}_questions.json"
        return StreamingResponse(
            json_bytes,
            media_type="application/json",
            headers={"Content-Disposition": f"attachment; filename={filename}"}
        )

    except HTTPException:
        # Re-raise HTTP exceptions (e.g. the 404 above) without re-wrapping
        raise
    except Exception as e:
        logging.error(f"Error retrieving questions: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error downloading questions: {str(e)}"
        )