import pandas as pd
import os
from pathlib import Path
import json
from datetime import datetime


def create_dataset_card(file_path):
    """Create a dataset card with key information about the CSV file."""
    try:
        # Read the CSV file
        df = pd.read_csv(file_path, encoding='utf-8')

        # Get file info
        file_stats = os.stat(file_path)
        file_size_mb = file_stats.st_size / (1024 * 1024)
        last_modified = datetime.fromtimestamp(file_stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')

        # Create dataset card
        card = {
            "filename": Path(file_path).name,
            "last_modified": last_modified,
            "file_size_mb": round(file_size_mb, 2),
            "num_rows": len(df),
            "num_columns": len(df.columns),
            "columns": list(df.columns),
            "column_dtypes": df.dtypes.astype(str).to_dict(),
            "null_counts": df.isnull().sum().to_dict(),
            "sample_rows": df.head(3).to_dict('records')
        }

        # Add language distribution if a 'lang' column exists
        if 'lang' in df.columns:
            card["language_distribution"] = df['lang'].value_counts().to_dict()

        # Add label distribution for any toxicity-related columns
        toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        label_stats = {}
        for col in toxic_cols:
            if col in df.columns:
                label_stats[col] = df[col].value_counts().to_dict()
        if label_stats:
            card["label_distribution"] = label_stats

        return card
    except Exception as e:
        # Record the failure per file instead of aborting the whole scan
        return {
            "filename": Path(file_path).name,
            "error": str(e)
        }
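
# For reference, a successful card is a plain dict shaped roughly like the
# sketch below (field values are illustrative only, not from any real file):
#
#   {
#       "filename": "train.csv",
#       "last_modified": "2024-05-01 12:00:00",
#       "file_size_mb": 1.23,
#       "num_rows": 1000,
#       "num_columns": 3,
#       "columns": ["id", "comment_text", "toxic"],
#       ...
#   }
#
# with "language_distribution" / "label_distribution" added only when the
# corresponding columns are present.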


def scan_dataset_directory(directory="dataset"):
    """Scan a directory tree for CSV files and create dataset cards."""
    print(f"\nScanning directory: {directory}")

    # Find all CSV files recursively
    csv_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))

    if not csv_files:
        print("No CSV files found!")
        return

    print(f"\nFound {len(csv_files)} CSV files")

    # Create dataset cards
    cards = {}
    for file_path in csv_files:
        print(f"\nProcessing: {file_path}")
        cards[file_path] = create_dataset_card(file_path)

    # Save all cards to a JSON file inside the scanned directory
    output_file = os.path.join(directory, "dataset_cards.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cards, f, indent=2, ensure_ascii=False)
    print(f"\n✓ Dataset cards saved to: {output_file}")

    # Print summary for each file
    for file_path, card in cards.items():
        print(f"\n{'=' * 80}")
        print(f"File: {card['filename']}")
        if 'error' in card:
            print(f"Error: {card['error']}")
            continue
        print(f"Size: {card['file_size_mb']:.2f} MB")
        print(f"Rows: {card['num_rows']:,}")
        print(f"Columns: {', '.join(card['columns'])}")
        if 'language_distribution' in card:
            print("\nLanguage Distribution:")
            for lang, count in card['language_distribution'].items():
                print(f"  {lang}: {count:,}")
        if 'label_distribution' in card:
            print("\nLabel Distribution:")
            for label, dist in card['label_distribution'].items():
                print(f"  {label}: {dist}")


if __name__ == "__main__":
    scan_dataset_directory()
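
# Example invocation (the script name and folder are assumptions, and the
# output shown is illustrative, not captured from a real run):
#
#   $ python dataset_cards.py
#   Scanning directory: dataset
#   Found 2 CSV files
#   ...
#   ✓ Dataset cards saved to: dataset/dataset_cards.json
#
# Passing a different folder, e.g. scan_dataset_directory("data/raw"),
# writes the JSON summary into that folder instead.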