File size: 6,290 Bytes
970eef1
c750639
970eef1
 
 
 
 
 
 
c750639
 
970eef1
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
e64aebd
970eef1
 
 
e64aebd
 
970eef1
 
 
 
 
 
 
e64aebd
970eef1
e64aebd
970eef1
 
e64aebd
970eef1
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
 
 
 
 
 
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
970eef1
 
e64aebd
c750639
 
 
 
 
e64aebd
c750639
 
e64aebd
c750639
 
e64aebd
c750639
 
e64aebd
c750639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e64aebd
c750639
e64aebd
 
c750639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e64aebd
c750639
 
e64aebd
970eef1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# FastAPI router exposing download endpoints for session artifacts:
# a full dataset snapshot as a ZIP, and the generated questions as JSON,
# both pulled from the HuggingFace Hub.
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse
from huggingface_hub import hf_hub_download, snapshot_download
import os
import tempfile
import shutil
import zipfile
import io
import logging
import json
from datasets import load_dataset

# Shared router; both endpoints below are grouped under the "download" tag.
router = APIRouter(tags=["download"])

@router.get("/download-dataset/{session_id}")
async def download_dataset(session_id: str):
    """
    Downloads the HuggingFace dataset associated with a session and returns it to the client.

    Args:
        session_id: Session identifier

    Returns:
        StreamingResponse: ZIP file containing every file of the dataset snapshot

    Raises:
        HTTPException: 500 if the snapshot cannot be downloaded or zipped
    """
    try:
        # Temporary directory holds the snapshot; it is removed automatically on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            # HuggingFace repo identifier
            repo_id = f"yourbench/yourbench_{session_id}"

            try:
                # Download the dataset snapshot from HuggingFace
                logging.info(f"Downloading dataset {repo_id}")
                snapshot_path = snapshot_download(
                    repo_id=repo_id,
                    repo_type="dataset",
                    local_dir=temp_dir,
                    token=os.environ.get("HF_TOKEN")
                )

                logging.info(f"Dataset downloaded to {snapshot_path}")

                # Build the ZIP entirely in memory so nothing extra touches disk
                zip_io = io.BytesIO()
                with zipfile.ZipFile(zip_io, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                    # Walk the snapshot and add every file to the archive
                    for root, _, files in os.walk(snapshot_path):
                        for file in files:
                            file_path = os.path.join(root, file)
                            # Store paths relative to the snapshot root inside the archive
                            arc_name = os.path.relpath(file_path, snapshot_path)
                            zip_file.write(file_path, arcname=arc_name)

                # Reset the cursor to the beginning of the stream before streaming it out
                zip_io.seek(0)

                # Return the ZIP to the client.
                # FIX: the header previously contained a literal placeholder instead of
                # interpolating the computed filename, leaving `filename` unused.
                filename = f"yourbench_{session_id}_dataset.zip"
                return StreamingResponse(
                    zip_io,
                    media_type="application/zip",
                    headers={"Content-Disposition": f"attachment; filename={filename}"}
                )

            except Exception as e:
                logging.error(f"Error while downloading the dataset: {str(e)}")
                raise HTTPException(
                    status_code=500,
                    detail=f"Error while downloading the dataset: {str(e)}"
                )
    except HTTPException:
        # FIX: re-raise HTTP exceptions unchanged instead of letting the generic
        # handler below re-wrap them (same pattern as download_questions).
        raise
    except Exception as e:
        logging.error(f"General error: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error during download: {str(e)}"
        )

@router.get("/download-questions/{session_id}")
async def download_questions(session_id: str):
    """
    Downloads the questions generated for a session in JSON format.

    Args:
        session_id: Session identifier

    Returns:
        StreamingResponse: JSON file containing only the list of generated questions

    Raises:
        HTTPException: 404 if no questions exist for the session, 500 on any other error
    """
    try:
        # HuggingFace repo identifier
        dataset_repo_id = f"yourbench/yourbench_{session_id}"

        # Collected questions from all available configurations
        all_questions = []

        # Try to load single-shot questions; failure here is logged but not fatal,
        # since the multi-hop configuration may still exist.
        try:
            single_dataset = load_dataset(dataset_repo_id, 'single_shot_questions')
            if single_dataset and len(single_dataset['train']) > 0:
                for idx in range(len(single_dataset['train'])):
                    all_questions.append({
                        "id": str(idx),
                        "question": single_dataset['train'][idx].get("question", ""),
                        "answer": single_dataset['train'][idx].get("self_answer", "No answer available"),
                        "type": "single_shot"
                    })
                logging.info(f"Loaded {len(all_questions)} single-shot questions")
        except Exception as e:
            logging.error(f"Error loading single-shot questions: {str(e)}")

        # Try to load multi-hop questions; ids continue after the single-shot ones
        try:
            multi_dataset = load_dataset(dataset_repo_id, 'multi_hop_questions')
            if multi_dataset and len(multi_dataset['train']) > 0:
                start_idx = len(all_questions)
                for idx in range(len(multi_dataset['train'])):
                    all_questions.append({
                        "id": str(start_idx + idx),
                        "question": multi_dataset['train'][idx].get("question", ""),
                        "answer": multi_dataset['train'][idx].get("self_answer", "No answer available"),
                        "type": "multi_hop"
                    })
                logging.info(f"Loaded {len(multi_dataset['train'])} multi-hop questions")
        except Exception as e:
            logging.error(f"Error loading multi-hop questions: {str(e)}")

        # If we couldn't load any questions, the dataset might not exist
        if len(all_questions) == 0:
            raise HTTPException(status_code=404, detail="No questions found for this session")

        # Serialize only the list of questions (no session_id, no wrapping object)
        questions_json = json.dumps(all_questions, ensure_ascii=False, indent=2)

        # Wrap the JSON payload in an in-memory stream for the response
        json_bytes = io.BytesIO(questions_json.encode('utf-8'))
        json_bytes.seek(0)

        # Return the JSON file for download.
        # FIX: the header previously contained a literal placeholder instead of
        # interpolating the computed filename, leaving `filename` unused.
        filename = f"yourbench_{session_id}_questions.json"
        return StreamingResponse(
            json_bytes,
            media_type="application/json",
            headers={"Content-Disposition": f"attachment; filename={filename}"}
        )

    except HTTPException:
        # Re-raise HTTP exceptions (e.g. the 404 above) without re-wrapping
        raise
    except Exception as e:
        logging.error(f"Error retrieving questions: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=f"Error downloading questions: {str(e)}"
        )