|
import logging
from datetime import datetime
from pathlib import Path
from typing import Union

import pandas as pd
|
|
|
|
|
# Configure the root logger once for this script: INFO and above, each record
# rendered as "<timestamp> | <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(message)s',
)

# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
|
|
def _latest_augmented_file(augmented_dir: Union[str, Path]) -> Path:
    """Return the ``threat_augmented_*.csv`` file in *augmented_dir* that sorts last.

    "Latest" means greatest in path sort order (the same choice ``max()`` makes);
    presumably the file names carry a sortable timestamp suffix -- TODO confirm.

    Raises:
        ValueError: if no matching file exists in *augmented_dir*.
    """
    candidates = sorted(Path(augmented_dir).glob("threat_augmented_*.csv"))
    if not candidates:
        # A bare max() over an empty glob also raises ValueError, but with a
        # message that says nothing about which directory/pattern was searched.
        raise ValueError(
            f"No 'threat_augmented_*.csv' files found in {augmented_dir}"
        )
    return candidates[-1]


def _standardize_augmented(aug_df: pd.DataFrame) -> pd.DataFrame:
    """Build a frame with the merged dataset's label columns from the augmented ``text`` column."""
    # Every augmented row gets the same fixed labels: toxic=1, threat=1,
    # all other label columns 0, and lang='en'.
    return pd.DataFrame({
        'comment_text': aug_df['text'],
        'toxic': 1,
        'severe_toxic': 0,
        'obscene': 0,
        'threat': 1,
        'insult': 0,
        'identity_hate': 0,
        'lang': 'en',
    })


def merge_datasets(
    main_path: Union[str, Path] = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_360K_7LANG_FINAL.csv",
    augmented_dir: Union[str, Path] = "dataset/augmented",
    output_path: Union[str, Path] = "dataset/processed/MULTILINGUAL_TOXIC_DATASET_AUGMENTED.csv",
) -> pd.DataFrame:
    """Merge augmented threat dataset with main dataset.

    Loads the main CSV, picks the last-sorting ``threat_augmented_*.csv`` in
    *augmented_dir*, relabels its ``text`` column into the main dataset's label
    columns, drops augmented rows whose text already occurs in the main
    dataset (or earlier in the augmented file), appends the rest to the main
    dataset, writes the result to *output_path* and returns it.

    Args:
        main_path: CSV with at least a ``comment_text`` column.
        augmented_dir: Directory searched for ``threat_augmented_*.csv``.
        output_path: Destination CSV; its parent directory is created if missing.

    Returns:
        The merged DataFrame (main rows first, then the new augmented rows).

    Raises:
        ValueError: if *augmented_dir* contains no matching file.
        Exception: any other error from loading, merging or saving is logged
            and re-raised unchanged.
    """
    try:
        logger.info("Loading main dataset...")
        main_df = pd.read_csv(main_path)
        logger.info(f"Main dataset: {len(main_df):,} rows")

        latest_augmented = _latest_augmented_file(augmented_dir)
        logger.info(f"Loading augmented dataset: {latest_augmented.name}")
        aug_df = pd.read_csv(latest_augmented)
        logger.info(f"Augmented dataset: {len(aug_df):,} rows")

        logger.info("Standardizing columns...")
        aug_df_standardized = _standardize_augmented(aug_df)

        logger.info("Checking for duplicates...")
        # Stack main texts above augmented texts so keep='first' flags an
        # augmented row when its text appeared in the main data or earlier in
        # the augmented file; main rows themselves are never dropped.
        combined_texts = pd.concat(
            [main_df['comment_text'], aug_df_standardized['comment_text']]
        )
        duplicates = combined_texts.duplicated(keep='first')
        # Positional slice: the augmented rows are the tail of the stacked series.
        aug_is_duplicate = duplicates.iloc[len(main_df):].to_numpy()
        duplicate_count = aug_is_duplicate.sum()
        logger.info(f"Found {duplicate_count} duplicates in augmented data")

        aug_df_standardized = aug_df_standardized[~aug_is_duplicate]
        logger.info(f"Augmented dataset after duplicate removal: {len(aug_df_standardized):,} rows")

        merged_df = pd.concat([main_df, aug_df_standardized], ignore_index=True)
        logger.info(f"Final merged dataset: {len(merged_df):,} rows")

        # Make sure the destination directory exists before writing.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        merged_df.to_csv(output_path, index=False)
        logger.info(f"Saved merged dataset to: {output_path}")

        logger.info("\nDataset Statistics:")
        logger.info(f"Original samples: {len(main_df):,}")
        logger.info(f"Added threat samples: {len(aug_df_standardized):,}")
        logger.info(f"Total samples: {len(merged_df):,}")
        logger.info(f"Threat samples in final dataset: {merged_df['threat'].sum():,}")

        return merged_df

    except Exception as e:
        # Log a one-line summary here; the re-raise keeps the full traceback
        # available to the caller.
        logger.error("Error merging datasets: %s", e)
        raise
|
|
|
# Script entry point: run the merge with the default dataset paths when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    merged_df = merge_datasets()