import pandas as pd
from pathlib import Path
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(message)s'
)
logger = logging.getLogger(__name__)


def merge_datasets():
    """Merge augmented threat dataset with main dataset"""
    try:
        # Load main dataset
        logger.info("Loading main dataset...")
        main_df = pd.read_csv("dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv")
        logger.info(f"Main dataset: {len(main_df):,} rows")

        # Load augmented dataset
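        # (the filenames are assumed to carry a sortable timestamp suffix, so max() over
        # the glob matches picks the most recent file; the naming scheme is not verified here)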
        augmented_path = Path("dataset/augmented")
        latest_augmented = max(augmented_path.glob("threat_augmented_*.csv"))
        logger.info(f"Loading augmented dataset: {latest_augmented.name}")
        aug_df = pd.read_csv(latest_augmented)
        logger.info(f"Augmented dataset: {len(aug_df):,} rows")

        # Standardize columns for augmented data
        logger.info("Standardizing columns...")
        aug_df_standardized = pd.DataFrame({
            'comment_text': aug_df['text'],
            'toxic': 1,
            'severe_toxic': 0,
            'obscene': 0,
            'threat': 1,
            'insult': 0,
            'identity_hate': 0,
            'lang': 'en'
        })

        # Check for duplicates between datasets
        logger.info("Checking for duplicates...")
        combined_texts = pd.concat(
            [main_df['comment_text'], aug_df_standardized['comment_text']],
            ignore_index=True
        )
        # keep='first' keeps the main-dataset occurrence, so only augmented rows that
        # repeat an existing text (or an earlier augmented row) are flagged
        duplicates = combined_texts.duplicated(keep='first')
        aug_duplicates = duplicates.iloc[len(main_df):].to_numpy()
        duplicate_count = int(aug_duplicates.sum())
        logger.info(f"Found {duplicate_count} duplicates in augmented data")

        # Remove duplicates from augmented data
        aug_df_standardized = aug_df_standardized[~aug_duplicates]
        logger.info(f"Augmented dataset after duplicate removal: {len(aug_df_standardized):,} rows")

        # Merge datasets
        merged_df = pd.concat([main_df, aug_df_standardized], ignore_index=True)
        logger.info(f"Final merged dataset: {len(merged_df):,} rows")

        # Save merged dataset
        output_path = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv"
        merged_df.to_csv(output_path, index=False)
        logger.info(f"Saved merged dataset to: {output_path}")

        # Print statistics
        logger.info("\nDataset Statistics:")
        logger.info(f"Original samples: {len(main_df):,}")
        logger.info(f"Added threat samples: {len(aug_df_standardized):,}")
        logger.info(f"Total samples: {len(merged_df):,}")
        logger.info(f"Threat samples in final dataset: {merged_df['threat'].sum():,}")

        return merged_df

    except Exception as e:
        logger.error(f"Error merging datasets: {str(e)}")
        raise


if __name__ == "__main__":
    merged_df = merge_datasets()
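
# A minimal sanity check one might run after this script (a sketch, not part of the
# pipeline; the path below is the output path written above):
#
#   import pandas as pd
#   df = pd.read_csv("dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv")
#   print(df.groupby("lang")["threat"].sum())   # threat counts per language
#   print(df["comment_text"].isna().sum())      # unexpected empty texts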