#!/usr/bin/env python
"""Explore the YouRBench test dataset and log its structure, samples, and stats.

Loads ``yourbench/yourbench_test`` from the Hugging Face Hub (using HF_TOKEN
from .env when available) and writes a human-readable report to a rotating
log file plus stdout.
"""

import json
import os
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

# Shared sink format so file and console output look identical.
LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
# Scalar dtypes treated as categorical when reporting unique values.
CATEGORICAL_DTYPES = ("string", "bool", "int32", "int64")
# Skip unique-value logging for high-cardinality columns.
MAX_UNIQUE_TO_LOG = 20
# How many example rows to dump per split.
SAMPLES_PER_SPLIT = 3


def setup_logging() -> None:
    """Replace default sinks with a rotating file sink and a stdout mirror."""
    logger.remove()
    logger.add(
        "logs/yourbench_dataset_exploration.log",
        level="INFO",
        rotation="10 MB",
        retention="1 week",
        format=LOG_FORMAT,
    )
    logger.add(sys.stdout, level="INFO", format=LOG_FORMAT)


def log_column_stats(split_dataset) -> None:
    """Log unique values for low-cardinality categorical columns of a split."""
    logger.info(f"\nColumn names: {split_dataset.column_names}")
    for column in split_dataset.column_names:
        try:
            # Nested features (e.g. Sequence) have no .dtype attribute; skip
            # them quietly instead of raising AttributeError into the warning.
            dtype = getattr(split_dataset.features[column], "dtype", None)
            if dtype in CATEGORICAL_DTYPES:
                unique_values = set(split_dataset[column])
                if len(unique_values) < MAX_UNIQUE_TO_LOG:
                    logger.info(f"Unique values in '{column}': {unique_values}")
        except Exception as e:
            # Best-effort stats: never let one odd column abort the report.
            logger.warning(f"Couldn't analyze column '{column}': {e}")


def explore_split(split_name: str, split_dataset) -> None:
    """Log size, schema, and a few sample rows for one dataset split."""
    logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}")
    logger.info(f"Number of examples: {len(split_dataset)}")
    logger.info(f"Features: {split_dataset.features}")

    # Sample and log a few examples as pretty-printed JSON.
    num_samples = min(SAMPLES_PER_SPLIT, len(split_dataset))
    logger.info(f"\nShowing {num_samples} sample examples:")
    for i in range(num_samples):
        example_json = json.dumps(split_dataset[i], indent=2, ensure_ascii=False)
        logger.info(f"\nExample {i}:\n{example_json}")

    if hasattr(split_dataset, 'column_names'):
        log_column_stats(split_dataset)


def main() -> None:
    """Load the dataset and log its structure, then each split in turn."""
    load_dotenv()
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        # Emitted before setup_logging() on purpose: matches the original
        # script, where this warning goes to the default stderr sink.
        logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")

    setup_logging()
    logger.info("Starting YouRBench dataset exploration")
    try:
        dataset_name = "yourbench/yourbench_test"
        logger.info(f"Loading dataset: {dataset_name}")
        dataset = load_dataset(dataset_name, token=hf_token)
        logger.info(f"Dataset structure: {dataset}")
        for split_name, split_dataset in dataset.items():
            explore_split(split_name, split_dataset)
    except Exception as e:
        # Top-level boundary: log and fall through so the completion line
        # is always written, as in the original script.
        logger.error(f"Error exploring dataset: {e}")

    logger.info("Dataset exploration completed")


if __name__ == "__main__":
    main()