import pandas as pd
import os
from pathlib import Path
import json
from datetime import datetime
|
def create_dataset_card(file_path):
    """Create a dataset card with key information about a CSV file."""
    try:
        df = pd.read_csv(file_path, encoding='utf-8')

        # Basic file metadata from the filesystem.
        file_stats = os.stat(file_path)
        file_size_mb = file_stats.st_size / (1024 * 1024)
        last_modified = datetime.fromtimestamp(file_stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')
|
        # Core card fields: shape, schema, null counts, and a small sample.
        card = {
            "filename": Path(file_path).name,
            "last_modified": last_modified,
            "file_size_mb": round(file_size_mb, 2),
            "num_rows": len(df),
            "num_columns": len(df.columns),
            "columns": list(df.columns),
            "column_dtypes": df.dtypes.astype(str).to_dict(),
            "null_counts": df.isnull().sum().to_dict(),
            "sample_rows": df.head(3).to_dict(orient='records')
        }
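        # Note: recent pandas versions return native Python scalars from
        # Series.to_dict(); on older versions, numpy integers in null_counts
        # or value_counts() results can make json.dump() raise a TypeError.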
|
        # Optional: per-language row counts when a 'lang' column exists.
        if 'lang' in df.columns:
            card["language_distribution"] = df['lang'].value_counts().to_dict()

        # Optional: value counts for the six toxicity label columns
        # (the label set used by the Jigsaw toxic-comment datasets).
        toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
        label_stats = {}
        for col in toxic_cols:
            if col in df.columns:
                label_stats[col] = df[col].value_counts().to_dict()
        if label_stats:
            card["label_distribution"] = label_stats

        return card
|
    except Exception as e:
        # Return a minimal card so one unreadable file doesn't stop the scan.
        return {
            "filename": Path(file_path).name,
            "error": str(e)
        }
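
# A successful card has the shape sketched below; the values are
# illustrative placeholders only, not real output:
# {
#     "filename": "train.csv",          # hypothetical file name
#     "last_modified": "2024-01-01 00:00:00",
#     "file_size_mb": 12.34,
#     "num_rows": 100000,
#     "num_columns": 8,
#     ...
# }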
|


def scan_dataset_directory(directory="dataset"):
    """Scan a directory tree for CSV files and write dataset cards."""
    print(f"\nScanning directory: {directory}")

    # Recursively collect every .csv path under the directory.
    csv_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
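    # An equivalent pathlib-based walk (a sketch, assumed to match the
    # os.walk version above):
    # csv_files = [str(p) for p in Path(directory).rglob('*.csv')]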
|

    if not csv_files:
        print("No CSV files found!")
        return

    print(f"\nFound {len(csv_files)} CSV files")
|
    # Build one card per file, keyed by its path.
    cards = {}
    for file_path in csv_files:
        print(f"\nProcessing: {file_path}")
        cards[file_path] = create_dataset_card(file_path)

    # Write the cards into the scanned directory rather than a hardcoded
    # "dataset/" path, so custom directories work too.
    output_file = os.path.join(directory, "dataset_cards.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cards, f, indent=2, ensure_ascii=False)
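    # Note: json.dump() writes bare NaN for any NaN values in sample_rows
    # (allow_nan=True is the default); strict JSON parsers may reject that.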
|

    print(f"\n✓ Dataset cards saved to: {output_file}")

    # Print a console summary for each card.
    for file_path, card in cards.items():
        print(f"\n{'='*80}")
        print(f"File: {card['filename']}")
        if 'error' in card:
            print(f"Error: {card['error']}")
            continue
|

        print(f"Size: {card['file_size_mb']:.2f} MB")
        print(f"Rows: {card['num_rows']:,}")
        print(f"Columns: {', '.join(card['columns'])}")

        if 'language_distribution' in card:
            print("\nLanguage Distribution:")
            for lang, count in card['language_distribution'].items():
                print(f"  {lang}: {count:,}")

        if 'label_distribution' in card:
            print("\nLabel Distribution:")
            for label, dist in card['label_distribution'].items():
                print(f"  {label}: {dist}")
|


if __name__ == "__main__":
    scan_dataset_directory()
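    # To scan a different tree, pass the path explicitly, e.g.:
    # scan_dataset_directory("data/raw")  # "data/raw" is a hypothetical path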