#!/usr/bin/env python
# Script to explore and log the content of the YourBench test dataset

import json
import os
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

# Load environment variables
load_dotenv()

# Get Hugging Face token
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")

# Set up logger: file sink with rotation and retention (loguru creates the
# logs/ directory automatically if it is missing)
logger.remove()
logger.add(
    "logs/yourbench_dataset_exploration.log",
    level="INFO",
    rotation="10 MB",
    retention="1 week",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)

# Add console output
logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
logger.info("Starting YouRBench dataset exploration") | |
try: | |
# Load the dataset | |
dataset_name = "yourbench/yourbench_test" | |
logger.info(f"Loading dataset: {dataset_name}") | |
dataset = load_dataset(dataset_name, token=hf_token) | |
# Log dataset structure | |
logger.info(f"Dataset structure: {dataset}") | |
# Explore each split in the dataset | |
for split_name, split_dataset in dataset.items(): | |
logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}") | |
logger.info(f"Number of examples: {len(split_dataset)}") | |
logger.info(f"Features: {split_dataset.features}") | |
# Sample and log a few examples | |
num_samples = min(3, len(split_dataset)) | |
logger.info(f"\nShowing {num_samples} sample examples:") | |
for i in range(num_samples): | |
example = split_dataset[i] | |
# Convert to JSON for better readability | |
example_json = json.dumps(example, indent=2, ensure_ascii=False) | |
logger.info(f"\nExample {i}:\n{example_json}") | |
# Additional dataset statistics | |
if hasattr(split_dataset, 'column_names'): | |
logger.info(f"\nColumn names: {split_dataset.column_names}") | |
# Log count of unique values for categorical columns if not too many | |
for column in split_dataset.column_names: | |
try: | |
if split_dataset.features[column].dtype in ['string', 'bool', 'int32', 'int64']: | |
unique_values = set(split_dataset[column]) | |
if len(unique_values) < 20: # Only if there aren't too many unique values | |
logger.info(f"Unique values in '{column}': {unique_values}") | |
except Exception as e: | |
logger.warning(f"Couldn't analyze column '{column}': {e}") | |
except Exception as e: | |
logger.error(f"Error exploring dataset: {e}") | |
logger.info("Dataset exploration completed") |