#!/usr/bin/env python
# Script to explore and log the content of the YourBench test dataset
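#
# Requires the `datasets`, `loguru`, and `python-dotenv` packages
# (pip install datasets loguru python-dotenv).
# Set HF_TOKEN in a local .env file to access private datasets.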

import json
import os
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

# Load environment variables
load_dotenv()

# Get Hugging Face token
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")

# Set up logger
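# logger.remove() drops loguru's default stderr handler so that only the
# sinks configured below are active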
logger.remove()
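# File sink; loguru creates the logs/ directory automatically if it is missing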
logger.add(
    "logs/yourbench_dataset_exploration.log",
    level="INFO",
    rotation="10 MB",
    retention="1 week",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)
# Add console output
logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)

logger.info("Starting YouRBench dataset exploration")

try:
    # Load the dataset
    dataset_name = "yourbench/yourbench_test"
    logger.info(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, token=hf_token)
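    # With no `split` argument, load_dataset returns a DatasetDict that maps
    # split names to Dataset objects (iterated below via dataset.items())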
    
    # Log dataset structure
    logger.info(f"Dataset structure: {dataset}")
    
    # Explore each split in the dataset
    for split_name, split_dataset in dataset.items():
        logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}")
        logger.info(f"Number of examples: {len(split_dataset)}")
        logger.info(f"Features: {split_dataset.features}")
        
        # Sample and log a few examples
        num_samples = min(3, len(split_dataset))
        logger.info(f"\nShowing {num_samples} sample examples:")
        
        for i in range(num_samples):
            example = split_dataset[i]
            # Convert to JSON for readability; default=str handles values
            # that are not natively JSON-serializable (e.g. datetimes)
            example_json = json.dumps(example, indent=2, ensure_ascii=False, default=str)
            logger.info(f"\nExample {i}:\n{example_json}")
        
        # Additional dataset statistics
        if hasattr(split_dataset, 'column_names'):
            logger.info(f"\nColumn names: {split_dataset.column_names}")
            
            # Log unique values for low-cardinality scalar columns
            for column in split_dataset.column_names:
                try:
                    # Nested features (e.g. Sequence) have no dtype, so use
                    # getattr to skip them instead of raising AttributeError
                    dtype = getattr(split_dataset.features[column], "dtype", None)
                    if dtype in ("string", "bool", "int32", "int64"):
                        unique_values = set(split_dataset[column])
                        if len(unique_values) < 20:  # Skip high-cardinality columns
                            logger.info(f"Unique values in '{column}': {unique_values}")
                except Exception as e:
                    logger.warning(f"Couldn't analyze column '{column}': {e}")

except Exception as e:
    logger.error(f"Error exploring dataset: {e}")

logger.info("Dataset exploration completed")